diff --git a/.clang-format b/.clang-format
index 9ba433b17362424973626470d930356c2173dd84..aff93435f58c522f5ed1090aef2005f76e91cf31 100644
--- a/.clang-format
+++ b/.clang-format
@@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true
 BinPackParameters: false
 BinPackArguments: false
 ...
-
diff --git a/.travis.yml b/.travis.yml
index c51e02eb79a9e53a2b8d1d663e8f0c3e0d8c3a61..e2d49daa1981396628efa5d16459eb70e9e76884 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -42,7 +42,7 @@ before_install:
 script:
   - |
     timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
-    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
+    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
   - |
     if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
     if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fd3582a1bca199d62d19550ffdd1efe9db520fa7..e76512166fcaea5daf2a67d1259331b680f15b7c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,8 +36,7 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
+option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@@ -82,10 +81,8 @@ if(ANDROID OR IOS)
         "Disable PYTHON when cross-compiling for Android and iOS" FORCE)
     set(WITH_RDMA OFF CACHE STRING
         "Disable RDMA when cross-compiling for Android and iOS" FORCE)
-    set(WITH_MKLDNN OFF CACHE STRING
-        "Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
-    set(WITH_MKLML OFF CACHE STRING
-        "Disable MKLML package when cross-compiling for Android and iOS" FORCE)
+    set(WITH_MKL OFF CACHE STRING
+        "Disable MKL when cross-compiling for Android and iOS" FORCE)
 
     # Compile PaddlePaddle mobile inference library
     if (NOT WITH_C_API)
@@ -111,6 +108,14 @@ else()
     set(THIRD_PARTY_BUILD_TYPE Release)
 endif()
 
+set(WITH_MKLML ${WITH_MKL})
+if (WITH_MKL AND AVX2_FOUND)
+    set(WITH_MKLDNN ON)
+else()
+    message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
+    set(WITH_MKLDNN OFF)
+endif()
+
 ########################################################################################
 
 include(external/mklml)     # download mklml package
@@ -128,6 +133,8 @@ include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/nccl)
+include(external/cares)
+include(external/grpc)
 
 include(cudnn)              # set cudnn libraries, must before configure
 include(configure)          # add paddle env configuration
@@ -158,14 +165,15 @@ set(EXTERNAL_LIBS
 )
 
 if(WITH_GPU)
-    list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
-    if(NOT WITH_DSO)
-        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
-    endif(NOT WITH_DSO)
+  include(cuda)
 endif(WITH_GPU)
 
+if(WITH_MKLML)
+    list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
+endif()
+
 if(WITH_MKLDNN)
-    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB})
+    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
 endif()
 
 if(USE_NNPACK)
diff --git a/Dockerfile b/Dockerfile
index 150344a8116e2be9b5bab8e5fdcc9c37f4025020..857d3f3e5f64791146741ffb29feabfcb2ecbb84 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,7 +29,7 @@ RUN apt-get update && \
     automake locales clang-format swig doxygen cmake  \
     liblapack-dev liblapacke-dev libboost-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
-    net-tools && \
+    net-tools libtool && \
     apt-get clean -y
 
 # Install Go and glide
diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md
index 040f5ffa41968cbf93a817faa1db86c18956341e..16c2390fd31bf1c79f29735fb98180d3f7302eb2 100644
--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -12,11 +12,11 @@ Machine:
 
 System: CentOS release 6.3 (Final), Docker 1.12.1.
 
-PaddlePaddle: paddlepaddle/paddle:latest (TODO: will rerun after 0.11.0)
-
-- MKL-DNN tag v0.10
-- MKLML 2018.0.20170720
+PaddlePaddle: paddlepaddle/paddle:latest (for MKLML and MKL-DNN), paddlepaddle/paddle:latest-openblas (for OpenBLAS)
+- MKL-DNN tag v0.11
+- MKLML 2018.0.1.20171007
 - OpenBLAS v0.2.20
+(TODO: will rerun after 0.11.0)
 	 
 On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
 
@@ -31,17 +31,37 @@ Input image size - 3 * 224 * 224, Time: images/second
 
 | BatchSize    | 64    | 128  | 256     |
 |--------------|-------| -----| --------|
-| OpenBLAS     | 7.82  | 8.62  | 10.34  | 
-| MKLML        | 11.02 | 12.86 | 15.33  |
-| MKL-DNN      | 27.69 | 28.8 | 29.27  |
+| OpenBLAS     | 7.80  | 9.00  | 10.80  | 
+| MKLML        | 12.12 | 13.70 | 16.18  |
+| MKL-DNN      | 28.46 | 29.83 | 30.44  |
+
+
+chart on batch size 128
+TBD
+
+ - ResNet-50
+
+| BatchSize    | 64    | 128   | 256    |
+|--------------|-------| ------| -------|
+| OpenBLAS     | 25.22 | 25.68 | 27.12  | 
+| MKLML        | 32.52 | 31.89 | 33.12  |
+| MKL-DNN      | 81.69 | 82.35 | 84.08  |
 
 
 chart on batch size 128
 TBD
 
- - ResNet
  - GoogLeNet
 
+| BatchSize    | 64    | 128   | 256    |
+|--------------|-------| ------| -------|
+| OpenBLAS     | 89.52 | 96.97 | 108.25 | 
+| MKLML        | 128.46| 137.89| 158.63 |
+| MKL-DNN      | 250.46| 264.83| 269.50 |
+
+chart on batch size 128
+TBD
+
 ### Laptop
 TBD
 ### Desktop
diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py
index bc893bab98c4d2e07c62fbd012d51a0939db4766..a88ecac67d9e677f14f6dc24ba9a337b1245243f 100644
--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@@ -5,6 +5,7 @@ height = 224
 width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
+use_gpu = get_config_arg('use_gpu', bool, True)
 
 args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
 define_py_data_sources2(
@@ -16,6 +17,8 @@ settings(
     learning_method=MomentumOptimizer(0.9),
     regularization=L2Regularization(0.0005 * batch_size))
 
+conv_projection = conv_projection if use_gpu else img_conv_layer
+
 def inception2(name, input, channels, \
     filter1,
     filter3R, filter3,
@@ -138,7 +141,7 @@ def inception(name, input, channels, \
     cat = concat_layer(
         name=name,
         input=[cov1, cov3, cov5, covprj],
-        bias_attr=True,
+        bias_attr=True if use_gpu else False,
         act=ReluActivation())
     return cat
 
diff --git a/benchmark/paddle/image/run_mkldnn.sh b/benchmark/paddle/image/run_mkldnn.sh
index 3cc779b48d082985f75ab1c053fbe262bc6d58aa..f768f6c29a84b40f917e0ccfde4d8c15f65c818b 100755
--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
@@ -40,6 +40,7 @@ fi
 for use_mkldnn in True False; do
   for batchsize in 64 128 256; do
     train vgg 19 $batchsize $use_mkldnn
-    train resnet 50  $batchsize $use_mkldnn
+    train resnet 50 $batchsize $use_mkldnn
+    train googlenet v1 $batchsize $use_mkldnn
   done
 done
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 24ddb24399dabeec9b8e5faf36be3eb21f420111..e550ec285668ea25757eeee9e7c5dc48fc9d339d 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -76,27 +76,14 @@ else()
     include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)
 
-if(WITH_MKLDNN)
-    add_definitions(-DPADDLE_USE_MKLDNN)
-    if (WITH_MKLML AND MKLDNN_IOMP_DIR)
-        message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}")
-        set(OPENMP_FLAGS "-fopenmp")
-        set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-        set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
-    else()
-        find_package(OpenMP)
-        if(OPENMP_FOUND)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-        else()
-            message(WARNING "Can not find OpenMP."
-                 "Some performance features in MKLDNN may not be available")
-        endif()
-    endif()
-
-endif(WITH_MKLDNN)
+if (WITH_MKLML AND MKLML_IOMP_LIB)
+    message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
+    set(OPENMP_FLAGS "-fopenmp")
+    set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+    set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
+endif()
 
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake
index 310450f7d009dc0cdae9c0079a96445af8ec8f95..d3f5bf6852b3b295f3b5806b0577a880b0ce6ba6 100644
--- a/cmake/cross_compiling/ios.cmake
+++ b/cmake/cross_compiling/ios.cmake
@@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
 # Set the architecture for iOS
 if(NOT DEFINED IOS_ARCH)
   if(IOS_PLATFORM STREQUAL "OS")
-    # FIXME(liuyiqun): support "armv7;armv7s;arm64" future
-    set(IOS_ARCH "arm64")
+    set(IOS_ARCH "armv7;armv7s;arm64")
   elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
-    # FIXME(liuyiqun): support "i386;x86_64" future
-    set(IOS_ARCH "x86_64")
+    set(IOS_ARCH "i386;x86_64")
   endif()
 endif()
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string  "Build architecture for iOS")
@@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_
 
 # Hidden visibilty is required for cxx on iOS 
 set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
-set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
+set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
 
 set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
 
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..6bea7cf3022242ce48cc882915f7e71810937283
--- /dev/null
+++ b/cmake/cuda.cmake
@@ -0,0 +1,188 @@
+if(NOT WITH_GPU)
+    return()
+endif()
+
+set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
+set(paddle_known_gpu_archs7 "30 35 50 52")
+set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
+
+######################################################################################
+# A function for automatic detection of GPUs installed  (if autodetection is enabled)
+# Usage:
+#   detect_installed_gpus(out_variable)
+function(detect_installed_gpus out_variable)
+  if(NOT CUDA_gpu_detect_output)
+    set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
+
+    file(WRITE ${cufile} ""
+      "#include <cstdio>\n"
+      "int main() {\n"
+      "  int count = 0;\n"
+      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
+      "  if (count == 0) return -1;\n"
+      "  for (int device = 0; device < count; ++device) {\n"
+      "    cudaDeviceProp prop;\n"
+      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
+      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
+      "  }\n"
+      "  return 0;\n"
+      "}\n")
+
+    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}"
+                    "--run" "${cufile}"
+                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
+                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
+                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+    if(nvcc_res EQUAL 0)
+      # only keep the last line of nvcc_out
+      STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
+      STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
+      list(GET nvcc_out -1 nvcc_out)
+      string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
+      set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE)
+    endif()
+  endif()
+
+  if(NOT CUDA_gpu_detect_output)
+    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
+    set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+  else()
+    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
+  endif()
+endfunction()
+
+
+########################################################################
+# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
+# Usage:
+#   select_nvcc_arch_flags(out_variable)
+function(select_nvcc_arch_flags out_variable)
+  # List of arch names
+  set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual")
+  set(archs_name_default "All")
+  if(NOT CMAKE_CROSSCOMPILING)
+    list(APPEND archs_names "Auto")
+  endif()
+
+  # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui)
+  set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.")
+  set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} )
+  mark_as_advanced(CUDA_ARCH_NAME)
+
+  # verify CUDA_ARCH_NAME value
+  if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
+    string(REPLACE ";" ", " archs_names "${archs_names}")
+    message(FATAL_ERROR "Only ${archs_names} architeture names are supported.")
+  endif()
+
+  if(${CUDA_ARCH_NAME} STREQUAL "Manual")
+    set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
+    set(CUDA_ARCH_PTX "50"                     CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
+    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
+  else()
+    unset(CUDA_ARCH_BIN CACHE)
+    unset(CUDA_ARCH_PTX CACHE)
+  endif()
+
+  if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
+    set(cuda_arch_bin "30 35")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
+    set(cuda_arch_bin "50")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
+    set(cuda_arch_bin "60 61")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
+    set(cuda_arch_bin "70")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
+    set(cuda_arch_bin ${paddle_known_gpu_archs})
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
+    detect_installed_gpus(cuda_arch_bin)
+  else()  # (${CUDA_ARCH_NAME} STREQUAL "Manual")
+    set(cuda_arch_bin ${CUDA_ARCH_BIN})
+  endif()
+
+  # remove dots and convert to lists
+  string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
+  string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
+  string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
+  string(REGEX MATCHALL "[0-9]+"   cuda_arch_ptx "${cuda_arch_ptx}")
+  list(REMOVE_DUPLICATES cuda_arch_bin)
+  list(REMOVE_DUPLICATES cuda_arch_ptx)
+
+  set(nvcc_flags "")
+  set(nvcc_archs_readable "")
+
+  # Tell NVCC to add binaries for the specified GPUs
+  foreach(arch ${cuda_arch_bin})
+    if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
+      # User explicitly specified PTX for the concrete BIN
+      list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
+      list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
+    else()
+      # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
+      list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
+      list(APPEND nvcc_archs_readable sm_${arch})
+    endif()
+  endforeach()
+
+  # Tell NVCC to add PTX intermediate code for the specified architectures
+  foreach(arch ${cuda_arch_ptx})
+    list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
+    list(APPEND nvcc_archs_readable compute_${arch})
+  endforeach()
+
+  string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
+  set(${out_variable}          ${nvcc_flags}          PARENT_SCOPE)
+  set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
+endfunction()
+
+message(STATUS "CUDA detected: " ${CUDA_VERSION})
+if (${CUDA_VERSION} LESS 7.0)
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
+elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+  # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
+  # warning for now.
+  list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
+endif()
+
+include_directories(${CUDA_INCLUDE_DIRS})
+list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
+if(NOT WITH_DSO)
+    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
+endif(NOT WITH_DSO)
+
+# setting nvcc arch flags
+select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
+list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
+message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
+
+# Set C++11 support
+set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+
+# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
+# So, don't set these flags here.
+list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
+list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
+list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+# Set :expt-relaxed-constexpr to suppress Eigen warnings
+list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
+
+if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
+endif()
+
+mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
+mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..e05111ee18efc906e39bcb56fb1be3b3c3dff5d6
--- /dev/null
+++ b/cmake/external/cares.cmake
@@ -0,0 +1,45 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
+
+include (ExternalProject)
+
+# NOTE: c-ares is needed when linking with grpc.
+
+SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares)
+SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares)
+SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE)
+
+ExternalProject_Add(
+    extern_cares
+    GIT_REPOSITORY "https://github.com/c-ares/c-ares.git"
+    GIT_TAG "cares-1_13_0"
+    PREFIX          ${CARES_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND   make
+    INSTALL_COMMAND make install
+)
+
+ADD_LIBRARY(cares STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION
+             "${CARES_INSTALL_DIR}/lib/libcares.a")
+
+include_directories(${CARES_INCLUDE_DIR})
+ADD_DEPENDENCIES(cares extern_cares)
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index c819eb4d70898e48eab499c666168d78262d4240..d4f252bb9f64c8db82b841fedf0817f5d8596501 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -28,15 +28,8 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
 ExternalProject_Add(
     extern_gflags
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    # TODO(yiwang): The annoying warnings mentioned in
-    # https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by
-    # gflags.  I fired a PR https://github.com/gflags/gflags/pull/230
-    # to fix it.  Before it gets accepted by the gflags team, we use
-    # my personal fork, which contains above fix, temporarily.  Let's
-    # change this back to the official Github repo once my PR is
-    # merged.
-    GIT_REPOSITORY  "https://github.com/wangkuiyi/gflags.git"
-    GIT_TAG         986964c07427ecb9cdb5bd73f73ebbd40e54dadb
+    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
+    GIT_TAG         77592648e3f3be87d6c7123eb81cbad75f9aef5a
     PREFIX          ${GFLAGS_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..219ea1b90881ccdbaf3fd41510fb4f2a8b6ec0f4
--- /dev/null
+++ b/cmake/external/grpc.cmake
@@ -0,0 +1,66 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
+
+include (ExternalProject)
+
+SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
+SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
+SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
+SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
+IF(APPLE)
+  SET(BUILD_CMD make -n | sed "s/-Werror//g" | sh)
+ELSE()
+  SET(BUILD_CMD make)
+ENDIF()
+
+ExternalProject_Add(
+    extern_grpc
+    DEPENDS protobuf zlib
+    GIT_REPOSITORY "https://github.com/grpc/grpc.git"
+    GIT_TAG "v1.7.x"
+    PREFIX          ${GRPC_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_IN_SOURCE 1
+    # NOTE(yuyang18):
+    # Disable -Werror, otherwise the compile will fail in MacOS.
+    # It seems that we cannot configure that by make command.
+    # Just dry run make command and remove `-Werror`, then use a shell to run make commands
+    BUILD_COMMAND  ${BUILD_CMD} HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin
+    INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
+)
+
+# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
+ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
+             "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
+
+ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION
+            "${GRPC_INSTALL_DIR}/lib/libgrpc++.a")
+ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION
+            "${GRPC_INSTALL_DIR}/lib/libgpr.a")
+
+ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION
+            "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a")
+
+include_directories(${GRPC_INCLUDE_DIR})
+ADD_DEPENDENCIES(grpc++_unsecure extern_grpc)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 5a06825beb73e85d8a55b7b578b187bee2c4340c..fc52d339d7a336b44c97f2e0a9fc8d6604854365 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -40,10 +40,9 @@ INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
 
 IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
     SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
-    SET(MKLDNN_MKLROOT   ${MKLML_ROOT})
-    SET(MKLDNN_IOMP_LIB  ${MKLML_IOMP_LIB})
-    SET(MKLDNN_IOMP_DIR  ${MKLML_LIB_DIR})
-    MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
+    MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
+ELSE()
+    MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
 ENDIF()
 
 SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
@@ -57,15 +56,16 @@ ExternalProject_Add(
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
-    CMAKE_ARGS          -DMKLROOT=${MKLDNN_MKLROOT}
+    CMAKE_ARGS          -DMKLROOT=${MKLML_ROOT}
     CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
     CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
     CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-                        -DMKLROOT:PATH=${MKLDNN_MKLROOT}
+                        -DMKLROOT:PATH=${MKLML_ROOT}
 )
 
 ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
 ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
-MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}")
+MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
+add_definitions(-DPADDLE_USE_MKLDNN)
 LIST(APPEND external_project_dependencies mkldnn)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 225380798112ba5a15b5989b01207b1b072feedf..4c4f59656dae68739f2f07f3febd510e727fe2dd 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND})
                 SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
             ENDIF()
         ELSEIF(IOS)
-            # FIXME(liuyiqun): support multiple architectures
-            SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
-            SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
-            IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
-                SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
-                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
-            ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+            IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+                SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
+                SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
                 SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
                 SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
+            ELSE()
+                MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
+                       "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
             ENDIF()
         ELSEIF(RPI)
             # use hardfp
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index be7f6a9465970711170bd15dcecaadeaa8a55f86..7cfe1e68078eed023fd0cc6971c573bb0108b4cc 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -15,7 +15,18 @@
 INCLUDE(ExternalProject)
 # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
 FIND_PACKAGE(Protobuf QUIET)
-SET(PROTOBUF_FOUND "OFF")
+macro(UNSET_VAR VAR_NAME)
+    UNSET(${VAR_NAME} CACHE)
+    UNSET(${VAR_NAME})
+endmacro()
+UNSET_VAR(PROTOBUF_INCLUDE_DIR)
+UNSET_VAR(PROTOBUF_FOUND)
+UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
+UNSET_VAR(PROTOBUF_PROTOC_LIBRARY)
+UNSET_VAR(PROTOBUF_LITE_LIBRARY)
+UNSET_VAR(PROTOBUF_LIBRARY)
+UNSET_VAR(PROTOBUF_INCLUDE_DIR)
+UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
 
 if(NOT COMMAND protobuf_generate_python)  # before cmake 3.4, protobuf_genrerate_python is not defined.
     function(protobuf_generate_python SRCS)
@@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB)
     # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
     # make `protobuf_generate_cpp` happy.
     SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
-
     FOREACH(dep ${protobuf_DEPS})
         ADD_DEPENDENCIES(protobuf ${dep})
         ADD_DEPENDENCIES(protobuf_lite ${dep})
@@ -128,11 +138,11 @@ endmacro()
 
 set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
 if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
-    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include)
-    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib)
-    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib)
-    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib)
-    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin)
+    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
     if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
         message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
         SET_PROTOBUF_VERSION()
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 8bd058222880b4df3b08da09c02f9fe7f1d0ee66..a8e1aca49c97df256b1269c286b0bce7732fa932 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
+
 INCLUDE(ExternalProject)
 
 SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index a98e069b7cd1654ddd5868560d0905eab6d9c692..1638cd8fdfc34575132462859e056a1907f0b2f1 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -50,6 +50,8 @@ ExternalProject_Add(
 )
 
 LIST(APPEND external_project_dependencies zlib)
+ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
 
 IF(WITH_C_API)
   INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 4593ae6180b6d7deb61d897eb634b17ac0bb1683..2b125cef6aa8d1021afe8a7a0d232d84d36be4bc 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -149,58 +149,3 @@ endforeach()
 foreach(flag ${GPU_COMMON_FLAGS})
     safe_set_nvflag(${flag})
 endforeach()
-
-
-set(CUDA_PROPAGATE_HOST_FLAGS OFF)
-
-# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
-# So, don't set these flags here.
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
-LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
-
-if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
-endif()
-
-function(specify_cuda_arch cuda_version cuda_arch)
-    if(${cuda_version} VERSION_GREATER "8.0")
-        foreach(capability 61 62)
-          if(${cuda_arch} STREQUAL ${capability})
-            list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
-          endif()
-        endforeach()
-    elseif(${cuda_version} VERSION_GREATER "7.0" and ${cuda_arch} STREQUAL "53")
-        list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
-    endif()
-endfunction()
-
-# Common gpu architectures: Kepler, Maxwell
-foreach(capability 30 35 50)
-      list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
-endforeach()
-
-if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
-      list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
-endif()
-
-# Modern gpu architectures: Pascal
-if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
-      list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
-      list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
-endif()
-
-# Custom gpu architecture
-set(CUDA_ARCH)
-
-if(CUDA_ARCH)
-  specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH})
-endif()
-
-set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
-
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index b9c1dde97bc444d793d67ff622fd6b13c6435a9a..c917ca0ff4e087b7caae8876da127bec6b39b798 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -459,11 +459,58 @@ function(py_test TARGET_NAME)
   if(WITH_TESTING)
     set(options STATIC static SHARED shared)
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})  
+    set(multiValueArgs SRCS DEPS ARGS)
+    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
              COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
-             python2 ${py_test_SRCS}
+             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   endif()
 endfunction()
+
+# grpc_library generate grpc code using grpc_cpp_plugin and protoc
+# then build the generated protobuf code and grpc code with your
+# implementation source codes together. Use SRCS argument for your
+# implementation source files and PROTO argument for your .proto
+# files.
+#
+# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep)
+
+function(grpc_library TARGET_NAME)
+  set(oneValueArgs PROTO)
+  set(multiValueArgs SRCS DEPS)
+  set(options "")
+  cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  message(STATUS "generating grpc ${grpc_library_PROTO}")
+
+  get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE)
+  get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
+  get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
+
+  protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
+  set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
+  set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
+  cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")
+
+  add_custom_command(
+          OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}"
+          COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+          ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
+          --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
+          DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
+
+  # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it
+  # as compiler warnings instead of error. Should try remove the warnings also.
+  set_source_files_properties(
+    ${grpc_grpc_srcs}
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
+
+  set_source_files_properties(
+    ${grpc_library_SRCS}
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
+endfunction()
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 117ab7f49cdf4a568cd203b2b17767643d0b2d50..0dc33ce385175d1e2dc454d41db467d4b9d9cf9a 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -115,8 +115,8 @@ function(link_paddle_exe TARGET_NAME)
         target_link_libraries(${TARGET_NAME} log)
     endif(ANDROID)
 
-    if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR)
-      target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+    if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB)
+      target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
     endif()
 
     add_dependencies(${TARGET_NAME} ${external_project_dependencies})
@@ -168,17 +168,3 @@ function(create_resources res_file output_file)
     COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file}
     DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py)
 endfunction()
-
-
-# Create a python unittest using run_python_tests.sh,
-# which takes care of making correct running environment
-function(add_python_test TEST_NAME)
-    foreach(arg ${ARGN})
-        get_filename_component(py_fn ${arg} NAME_WE)
-        set(TRG_NAME ${TEST_NAME}_${py_fn})
-        add_test(NAME ${TRG_NAME}
-                COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
-                python2 ${arg}
-                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-    endforeach()
-endfunction()
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 203506d7ab84e5a5be2232b077eac2d433a99766..c3f9c18d0663a7a24880b441981875c1e4f015aa 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -54,7 +54,7 @@ img_conv
 
 ..  _api_v2.layer_context_projection:
 
-context_projection 
+context_projection
 ------------------
 ..  autoclass:: paddle.v2.layer.context_projection
     :noindex:
@@ -70,7 +70,7 @@ Image Pooling Layer
 img_pool
 --------
 ..  autoclass:: paddle.v2.layer.img_pool
-    :noindex:   
+    :noindex:
 
 spp
 ---
@@ -104,7 +104,7 @@ sum_to_one_norm
 ---------------
 ..  autoclass:: paddle.v2.layer.sum_to_one_norm
     :noindex:
-    
+
 cross_channel_norm
 ------------------
 ..  autoclass:: paddle.v2.layer.cross_channel_norm
@@ -114,7 +114,7 @@ row_l2_norm
 -----------
 ..  autoclass:: paddle.v2.layer.row_l2_norm
     :noindex:
-    
+
 Recurrent Layers
 ================
 
@@ -335,6 +335,16 @@ bilinear_interp
 ..  autoclass:: paddle.v2.layer.bilinear_interp
     :noindex:
 
+dot_prod
+---------
+.. autoclass:: paddle.v2.layer.dot_prod
+    :noindex:
+
+out_prod
+--------
+.. autoclass:: paddle.v2.layer.out_prod
+    :noindex:
+
 power
 -----
 ..  autoclass:: paddle.v2.layer.power
@@ -372,6 +382,11 @@ cos_sim
 ..  autoclass:: paddle.v2.layer.cos_sim
     :noindex:
 
+l2_distance
+-----------
+..  autoclass:: paddle.v2.layer.l2_distance
+    :noindex:
+
 trans
 -----
 ..  autoclass:: paddle.v2.layer.trans
@@ -400,6 +415,13 @@ multiplex
 ..  autoclass:: paddle.v2.layer.multiplex
     :noindex:
 
+Factorization Machine Layer
+============================
+
+factorization_machine
+---------------------
+..  autoclass:: paddle.v2.layer.factorization_machine
+    :noindex:
 
 Slicing and Joining Layers
 ==========================
diff --git a/doc/design/float16.md b/doc/design/float16.md
index 078801ba2ed969d26dd31d5ec4ed268686cf7016..1ea95ed6b5d6792171569b6ff76d09be92fcb13e 100644
--- a/doc/design/float16.md
+++ b/doc/design/float16.md
@@ -28,6 +28,51 @@ The goal of float16 is to serve as a key for the executor to find and run the co
 - [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
 - [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
 
+### CUDA version issue
+There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. 
+CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows:
+```
+typedef struct __align__(2) {
+   unsigned short x;
+} __half;
+
+typedef __half half;
+```
+This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types:
+```
+__global__ void Add() {
+  half a, b, c;
+  c = __hadd(a, b); // correct
+  c = a + b; // compiler error: no operator "+" matches these operands
+}
+```
+CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
+
+Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows:
+```
+typedef struct __CUDA_ALIGN__(2) {
+    unsigned short x;
+} __half_raw;
+
+
+struct __CUDA_ALIGN__(2) __half {
+protected:
+    unsigned short __x;
+public:
+    // constructors and conversion operators from/to 
+    // __half_raw and other built-in data types
+}
+
+typedef __half half;
+
+__device__ __forceinline__ 
+__half operator+(const __half &lh, const __half &rh) { 
+    return __hadd(lh, rh); 
+}
+
+// Other overloaded operators
+``` 
+This new design makes `c = a + b` work correctly for CUDA half data type. 
 
 ## Implementation
 The float16 class holds a 16-bit `uint16_t` data internally.
diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD
index 16236763a73770f3fe5eadf67645765d0456f875..ec6d4681836e189f46dbb9b915a237dc15cda7cf 100644
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
@@ -36,13 +36,13 @@ Figure 1. PaddlePaddle on IA.
 我们把集成方案大致分为了如下几个方面。
 
 ### CMake
-我们会在`CMakeLists.txt`中会添加`WITH_MKLDNN`的选项，当设置这个值为`ON`的时候会启用编译MKL-DNN功能。同时会自动开启OpenMP用于提高MKL-DNN的性能。
+我们会在`CMakeLists.txt`中会给用户添加一个`WITH_MKL`的开关，他是负责`WITH_MKLML`和`WITH_MKLDNN`的总开关。
 
-同时，我们会引入`WITH_MKLML`选项，用于选择是否使用MKL-DNN自带的MKLML安装包。这个安装包可以独立于MKL-DNN使用，但是建议在开启MKL-DNN的同时也打开MKLML的开关，这样才能发挥最好的性能。
+当打开`WITH_MKL`时，会开启MKLML的功能，作为PaddlePaddle的CBLAS和LAPACK库，同时会开启Intel OpenMP用于提高MKLML的性能。 如果系统支持AVX2指令集及以上，同时会开启MKL-DNN功能。
 
-所以，我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件，它们会在编译PaddlePaddle的时候下载对应的软件包，并放到PaddlePaddle的third party目录中。
+当关闭`WITH_MKL`时，MKLML和MKL-DNN功能会同时关闭。
 
-**备注**：当`WITH_MKLML=ON`的时候，会优先使用这个包作为PaddlePaddle的CBLAS和LAPACK库，所以会稍微改动`cmake/cblas.cmake`中的逻辑。
+所以，我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件，它们会在编译PaddlePaddle的时候下载对应的软件包，并放到PaddlePaddle的third party目录中。
 
 ### Layers
 所有MKL-DNN相关的C++ layers，都会按照PaddlePaddle的目录结构存放在
diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md
index 320dccec3ddc7bfe6042f4e65b2518ea7b1ad24a..2cd4b6225b61cf374458e40afabad7745f61ba71 100644
--- a/doc/design/reader/README.md
+++ b/doc/design/reader/README.md
@@ -1,25 +1,25 @@
 # Python Data Reader Design Doc
 
-At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that
+During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that performs reading input data, we define the following:
 
-- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items.
-- A *reader creator* is a function that returns a reader function.
-- A *reader decorator* is a function, which accepts one or more readers, and returns a reader.
-- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
+- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items.
+- A *reader creator*: A function that returns a reader function.
+- A *reader decorator*: A function, which takes in one or more readers, and returns a reader.
+- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
 
-and provide function which converts reader to batch reader, frequently used reader creators and reader decorators.
+and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators.
 
 ## Data Reader Interface
 
-Indeed, *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`):
+*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows:
 
 ```
 iterable = data_reader()
 ```
 
-Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Item should be of [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int)
+The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.)
 
-An example implementation for single item data reader creator:
+An example implementation for single item data reader creator is as follows:
 
 ```python
 def reader_creator_random_image(width, height):
@@ -29,7 +29,7 @@ def reader_creator_random_image(width, height):
     return reader
 ```
 
-An example implementation for multiple item data reader creator:
+An example implementation for multiple item data reader creator is as follows:
 ```python
 def reader_creator_random_image_and_label(width, height, label):
     def reader():
@@ -40,9 +40,10 @@ def reader_creator_random_image_and_label(width, height, label):
 
 ## Batch Reader Interface
 
-*batch reader* can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple.
+*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple.
+
+Here are some valid outputs:
 
-Here are valid outputs:
 ```python
 # a mini batch of three data items. Each data item consist three columns of data, each of which is 1.
 [(1, 1, 1),
@@ -58,20 +59,22 @@ Here are valid outputs:
 Please note that each item inside the list must be a tuple, below is an invalid output:
 ```python
  # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],).
- # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
- # or three column of datas, each of which is 1.
+ # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
+ # or three columns of data, each of which is 1.
 [[1,1,1],
 [2,2,2],
 [3,3,3]]
 ```
 
-It's easy to convert from reader to batch reader:
+It is easy to convert from a reader to a batch reader:
+
 ```python
 mnist_train = paddle.dataset.mnist.train()
 mnist_train_batch_reader = paddle.batch(mnist_train, 128)
 ```
 
-Also easy to create custom batch reader:
+It is also straight forward to create a custom batch reader:
+
 ```python
 def custom_batch_reader():
     while True:
@@ -85,7 +88,8 @@ mnist_random_image_batch_reader = custom_batch_reader
 
 ## Usage
 
-batch reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`:
+Following is how we can use the reader with PaddlePaddle:
+The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows:
 
 ```python
 # two data layer is created:
@@ -99,13 +103,13 @@ paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...)
 
 ## Data Reader Decorator
 
-*Data reader decorator* takes a single or multiple data reader, returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax.
+The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax.
 
-Since we have a strict interface for data readers (no parameter, return a single data item). Data reader can be used flexiable via data reader decorators. Following are a few examples:
+Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples:
 
 ### Prefetch Data
 
-Since reading data may take time and training can not proceed without data. It is generally a good idea to prefetch data.
+Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data.
 
 Use `paddle.reader.buffered` to prefetch data:
 
@@ -117,9 +121,9 @@ buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100)
 
 ### Compose Multiple Data Readers
 
-For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
+For example, if we want to use a source of real images (say reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
 
-We can do:
+We can do the following :
 
 ```python
 def reader_creator_random_image(width, height):
@@ -139,13 +143,13 @@ false_reader = reader_creator_bool(False)
 
 reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader)
 # Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry.
-# And we don't care second item at this time.
+# And we don't care about the second item at this time.
 paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
 ```
 
 ### Shuffle
 
-Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffle them before a data entry is read.
+Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read.
 
 Example:
 ```python
@@ -154,21 +158,21 @@ reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)
 
 ## Q & A
 
-### Why reader return only a single entry, but not a mini batch?
+### Why does a reader return only a single entry, and not a mini batch?
 
-Always returning a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2).
+Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead if a single entry, the training code will be more complicated because it need to handle cases like a batch size 2).
 
-We provide function `paddle.batch` to turn (single entry) reader into batch reader.
+We provide a function: `paddle.batch` to turn (a single entry) reader into a batch reader.
 
-### Why do we need batch reader, isn't train take reader and batch_size as arguments sufficient?
+### Why do we need a batch reader, isn't is sufficient to give the reader and batch_size as arguments during training ?
 
-In most of the case, train taking reader and batch_size as arguments would be sufficent. However sometimes user want to customize order of data entries inside a mini batch. Or even change batch size dynamically.
+In most of the cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases using a batch reader is very efficient and helpful.
 
-### Why use a dictionary but not a list to provide mapping?
+### Why use a dictionary instead of a list to provide mapping?
 
-We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["image", "label"]`) is because that user can easily resue item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip item (e.g., using `{"image_a":0, "label":2}`).
+Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`).
 
-### How to create custom data reader creator
+### How to create a custom data reader creator ?
 
 ```python
 def image_reader_creator(image_path, label_path, n):
@@ -192,7 +196,7 @@ paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
 
 ### How is `paddle.train` implemented
 
-An example implementation of paddle.train could be:
+An example implementation of paddle.train is:
 
 ```python
 def train(batch_reader, mapping, batch_size, total_pass):
diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md
index ac7e98ccf1aadbb973a4801fde842375cf63448c..2b4f921ae93c3b443ed62a28b1fa9fbda14f73ab 100644
--- a/doc/design/refactor/distributed_architecture.md
+++ b/doc/design/refactor/distributed_architecture.md
@@ -2,106 +2,70 @@
 
 ## Abstract
 
-PaddlePaddle v0.10.0 uses the "trainer-parameter server"
-architecture. We run multiple replicated instances of trainers (runs
-the same code written by the user) and parameter servers for
-distributed training. This architecture served us well, but has some
-limitations:
+PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has few limitations:
 
-1. Need to write special code to handle tasks which should only be run
-  by a single trainer. E.g., initializing model and saving model.
+1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc.
 
-2. Model parallelism is hard: need to write if-else branches conditioned
-  on the trainer ID to partition model onto each trainer, and manually
-  write the inter-model-shard communication code.
+2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers.
 
-3. The user can not directly specify the parameter update rule: need
-   to modify the parameter server C++ code and compile a new
-   binary. This adds complication for researchers: A lot of extra
-   effort is required. Besides, the training job submission program
-   may not allow running arbitrary binaries.
+3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.
 
-This design doc discusses PaddlePaddle's new distributed training
-architecture that addresses the above limitations.
+This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations.
 
 ## Analysis
 
-We will assume the user writes the trainer program by Python, the same
-analysis holds if the trainer program is written in C++.
+The assumption is that the user writes the trainer program in either Python or C++.
 
 ### Limitation 1
 
-If we look at the Python code that the user writes, there are two
-kinds of functionalities:
+There are two basic functionalities in the trainer program:
 
-- The training logic such as load / save model and print log.
-- The neural network definition such as the definition of the data
-  layer, the fully connected layer, the cost function and the
+1. The training logic such as loading / saving the model and printing out the logs.
+2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the
   optimizer.
 
-When we training with PaddlePaddle v0.10.0 distributedly, multiple
-replicated Python instances are running on different nodes: both the
-training logic and the neural network computation is replicated.
+When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both: the
+training logic as well as the neural network computation logic, is replicated.
 
-The tasks that should only run once all belong to the training logic,
-if we only replicate the neural network computation, but do **not**
-replicate the training logic, the limitation could be solved.
+The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
+replicate the training logic, the limitation mentioned above can be avoided.
 
 ### Limitation 2
 
-Model parallelism means running a single model on multiple nodes by
-partitioning the model onto different nodes and managing the
-inter-model-shard communications.
+Model parallelism means that a single model is partitioned into different components and each node runs one of the component separately. This comes at the extra cost of managing the
+inter-model-shard communication between nodes.
 
-PaddlePaddle should be able to modify the nerual network computation
-definition to support model parallelism automatically. However, the
-computation is only specified in Python code, and PaddlePaddle can not
-modify Python code.
+PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the
+computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup.
 
-Just like compiler uses a intermediate representation (IR) so that
-programmer does not need to manually optimize their code in most of
-the cases - the compiler will optimize the IR:
+Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
 
 <img src="src/compiler.png"/>
 
-We can have our own IR too: PaddlePaddle can support model parallel by
-converting the IR so the user no longer need to manually do it in
-Python:
+PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
 
 <img src="src/paddle-compile.png"/>
 
-The IR for PaddlePaddle after refactor is called `Block`, it specifies
-the computation dependency graph and the variables used in the
-computation.
+The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
 
 ### Limitation 3
 
-The user can not directly specify the parameter update rule for the
-parameter server because the parameter server does not use the same
-computation definition as the trainer. Instead, the update rule is
-baked in the parameter server. The user can not specify the update
-rule in the same way of specifying the trainer computation.
+The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
 
-This could be fixed by making the parameter server run the same
-computation definition as the trainer. For a detailed explanation,
-please
-see
+This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
 [Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
 
 ## Distributed Training Architecture
 
-The new distributed training architecture can address the above
-limitations. Below is the illustration:
+The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
 
 <img src="src/distributed_architecture.png"/>
 
-The architecture includes major components: *PaddlePaddle Python*,
-*PaddlePaddle converter* and *PaddlePaddle runtime*:
+The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.
 
 ### PaddlePaddle Python
 
-PaddlePaddle Python is the Python library that user's Python trainer
-invoke to build the neural network topology, start training, etc.
+PaddlePaddle Python is the Python library that user's Python code invokes, to read the data. build the neural network topology, start training, etc.
 
 ```Python
 paddle.init()
@@ -117,102 +81,60 @@ for i in range(1000):
 	print cost_val
 ```
 
-The code above is a typical Python trainer code, the neural network
-topology is built using helper functions such as
-`paddle.layer.fc`. The training is done by calling `session.eval`
-iteratively.
+The above code is what a typical Python trainer code is, the neural network topology is built using the helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively.
 
 #### session.eval
 
-As shown in the graph, `session.eval` sends the IR and the evaluation
-inputs/targets to the PaddlePaddle cluster for evaluation. The
-targets can be any variable in the computation graph. When the target
-is the `optimizer` variable, the neural network will be optimized
-once. When the target is the `cost` variable, `session.eval` returns
-the cost value.
+As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation.
+The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.
 
-The Python `session` is a wrapper of the C++ `Session` class. For more
-information about `Session`, please
-see [Design Doc: Session](./session.md).
+The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md).
 
 ### PaddlePaddle Converter
 
-PaddlePaddle converter automatically converts the IR in the request
-(IR and evaluation inputs/targets) from PaddlePaddle Python to new
-partitioned IRs and dispatch the new IRs and evaluation inputs/targets
-to different PaddlePaddle runtimes. Below are the steps:
+The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed :
 
-1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that
-   fetches the eval targets to the IR.
+1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.
 
-1. Extract a new computation (sub)graph with `feed` and `fetch` OP as
-   the boundary. The runtime does not need to run the OP that is not
-   dependent by the `fetch` OP.
+2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP.
 
-1. Optimizes the computation graph.
+3. Optimize the computation graph.
 
-1. Place the OPs in the graph onto different devices on different
-   PaddlePaddle runtime according to a placement algorithm and device
-   constraint specified by the user.
+4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user.
 
-1. Partition the graph according to runtime boundaries and add `send` /
-   `recv` OP pair on the runtime boundaries.
+5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries.
 
-1. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+
+7. PaddlePaddle runtimes with the `fetch` OP reports evaluation results back to the converter, the converter reports the evaluation results back to the PaddlePaddle Python.
 
-1. PaddlePaddle runtimes with the `fetch` OP reports evaluation
-   results back to the converter, the convert reports the evaluation
-   results back to the PaddlePaddle Python.
-   
 The output IRs will be cached to optimize the conversion latency.
 
 
 #### Placement Algorithm
 
-Our first implementation will only support "trainer-parameter server"
-placement: the parameters, initializers, and optimizers are placed on
-the PaddlePaddle runtimes with the parameter server role. And
-everything else will be placed on the PaddlePaddle runtimes with the
-trainer role. This has the same functionality of our
-"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
-is more general and flexible.
+Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
 
-In the future, we will implement the general placement algorithm,
-which makes placements according to the input IR, and a model of
-device computation time and device communication time. Model
-parallelism requires the general placement algorithm.
+In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
 
 
 ### PaddlePaddle Runtime
 
-The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and
-runs the IR. The runtime does not need to do OP placement since it's
-already done by the converter.
+The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
 
 
 ### Local Training Architecture
 
-The local training architecture will be the same as the distributed
-training architecture, the differences are everything runs locally,
-and there is just one PaddlePaddle runtime:
+The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
 
 <img src="src/local_architecture.png"/>
 
 
 ### Training Data
 
-In PaddlePaddle v0.10.0, training data is typically read
-with [data reader](../reader/README.md) from Python. This approach is
-no longer efficient when training distributedly since the Python
-process no longer runs on the same node with the trainer processes,
-the Python reader will need to read from the distributed filesystem
-(assuming it has the access) and send to the trainers, doubling the
-network traffic.
-
-When doing distributed training, the user can still use Python data
-reader: the training data are sent with `session.eval`. However should
-be used for debugging purpose only. The users are encouraged to use
-the read data OPs.
+In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic.
+
+When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs.
 
 
 ## References:
diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst
deleted file mode 100644
index b473944fc7fb89d3e0a0b330933f2226734bb5bd..0000000000000000000000000000000000000000
--- a/doc/getstarted/basic_usage/index_cn.rst
+++ /dev/null
@@ -1,108 +0,0 @@
-经典的线性回归任务
-==================
-
-PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。
-
-任务简介
---------
-
-我们展示如何用PaddlePaddle解决 `单变量的线性回归 <https://www.baidu.com/s?wd=单变量线性回归>`_ 问题。线性回归的输入是一批点 `(x, y)` ，其中 `y = wx + b + ε`， 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。
-
-一个例子是房产估值。我们假设房产的价格（y）是其大小（x）的一个线性函数，那么我们可以通过收集市场上房子的大小和价格，用来估计线性函数的参数w 和 b。
-
-准备数据
------------
-
-假设变量 `x` 和 `y` 的真实关系为： `y = 2x + 0.3 + ε`，这里展示如何使用观测数据来拟合这一线性关系。首先，Python代码将随机产生2000个观测点，作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。
-
-.. code-block:: python
-
-    # dataprovider.py
-    from paddle.trainer.PyDataProvider2 import *
-    import random
-
-    # 定义输入数据的类型: 2个浮点数
-    @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
-    def process(settings, input_file):
-        for i in xrange(2000):
-            x = random.random()
-            yield [x], [2*x+0.3]
-
-训练模型
------------
-
-为了还原 `y = 2x + 0.3`，我们先从一条随机的直线 `y' = wx + b` 开始，然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小，最终趋于接近。这个过程就是模型的训练过程，而 `w` 和 `b` 就是模型的参数，即我们的训练目标。
-
-在PaddlePaddle里，该模型的网络配置如下。
-
-.. code-block:: python
-
-    # trainer_config.py
-    from paddle.trainer_config_helpers import *
-
-    # 1. 定义数据来源，调用上面的process函数获得观测数据
-    data_file = 'empty.list'
-    with open(data_file, 'w') as f: f.writelines(' ')
-    define_py_data_sources2(train_list=data_file, test_list=None, 
-                            module='dataprovider', obj='process',args={})
-
-    # 2. 学习算法。控制如何改变模型参数 w 和 b
-    settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-
-    # 3. 神经网络配置
-    x = data_layer(name='x', size=1)
-    y = data_layer(name='y', size=1)
-    # 线性计算网络层: ȳ = wx + b
-    ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-    # 计算误差函数，即  ȳ 和真实 y 之间的距离
-    cost = square_error_cost(input= ȳ, label=y)
-    outputs(cost)
-
-
-这段简短的配置展示了PaddlePaddle的基本用法：
-
-- 第一部分定义了数据输入。一般情况下，PaddlePaddle先从一个文件列表里获得数据文件地址，然后交给用户自定义的函数（例如上面的 `process`函数）进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件，所以放一个空列表（`empty.list`）即可。
-
-- 第二部分主要是选择学习算法，它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法，这里使用一个基于momentum的随机梯度下降(SGD)算法，该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新更新。
-
-- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层，所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络单元：
-    
-    - **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到接下来的网络层。这里数据层有两个，分别对应于变量 `x` 和 `y`。
-    - **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以拟合任意的函数来学习复杂的数据关系。
-    - **回归误差代价层**：回归误差代价层 `square_error_cost` 是众多误差代价函数层的一种，它们在训练过程作为网络的出口，用来计算模型的误差，是模型参数优化的目标函数。
-
-定义了网络结构并保存为 `trainer_config.py` 之后，运行以下训练命令：
-
-.. code-block:: bash
-
-    paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
-
-PaddlePaddle将在观测数据集上迭代训练30轮，并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到，随着轮数增加误差代价函数的输出在不断的减小，这意味着模型在训练数据上不断的改进，直到逼近真实解：` y = 2x + 0.3 `
-
-模型检验
------------
-
-训练完成后，我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测，评价预测的效果。在这个例子中，由于已经知道了真实答案，我们可以直接观察模型的参数是否符合预期来进行检验。
-
-PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件，所以可以利用如下方法读取模型的参数。
-
-.. code-block:: python
-
-    import numpy as np
-    import os
-
-    def load(file_name):
-        with open(file_name, 'rb') as f:
-            f.read(16) # skip header for float type.
-            return np.fromfile(f, dtype=np.float32)
-        
-    print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
-    # w=1.999743, b=0.300137
-
-.. image:: ./parameters.png
-     :align: center
-     :scale: 80 %
-
-从图中可以看到，虽然 `w` 和 `b` 都使用随机值初始化，但在起初的几轮训练中它们都在快速逼近真实值，并且后续仍在不断改进，使得最终得到的模型几乎与真实模型一致。
-
-这样，我们用PaddlePaddle解决了单变量线性回归问题， 包括数据输入、模型训练和最后的结果验证。
diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst
deleted file mode 100644
index 2cc438ebbe0f97345d25354b93b4ebbd43502415..0000000000000000000000000000000000000000
--- a/doc/getstarted/basic_usage/index_en.rst
+++ /dev/null
@@ -1,101 +0,0 @@
-Simple Linear Regression
-========================
-
-PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on.
-
-Problem Background
-------------------
-
-Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression <https://en.wikipedia.org/wiki/Simple_linear_regression>`_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets.
-
-Prepare the Data
------------------
-
-Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types.
-
-    .. code-block:: python
-
-        # dataprovider.py
-        from paddle.trainer.PyDataProvider2 import *
-        import random
-
-        # define data types of input: 2 real numbers
-        @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
-        def process(settings, input_file):
-            for i in xrange(2000):
-                x = random.random()
-                yield [x], [2*x+0.3]
-
-Train a NeuralNetwork
-----------------------
-
-To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle:
-
-    .. code-block:: python
-
-        # trainer_config.py
-        from paddle.trainer_config_helpers import *
-
-        # 1. read data. Suppose you saved above python code as dataprovider.py
-        data_file = 'empty.list'
-        with open(data_file, 'w') as f: f.writelines(' ')
-        define_py_data_sources2(train_list=data_file, test_list=None, 
-                module='dataprovider', obj='process',args={})
-
-        # 2. learning algorithm
-        settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-
-        # 3. Network configuration
-        x = data_layer(name='x', size=1)
-        y = data_layer(name='y', size=1)
-        y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-        cost = square_error_cost(input=y_predict, label=y)
-        outputs(cost)
-
-Some of the most fundamental usages of PaddlePaddle are demonstrated:
-
--  The first part shows how to feed data into PaddlePaddle. In general cases, PaddlePaddle reads raw data from a list of files, and then do some user-defined process to get real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly.
-
--  The second part describes learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum based optimizer will suffice here, and it processes 12 data points each time.
-
--  Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration:
-	-  **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for ``X`` and ``Y``.
-	-  **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to current layer and does the actual computation specified as activation function. Computation layers like this are the fundamental building blocks of a deeper model.
-	-  **Cost Layer**: in training phase, cost layers are usually the last layers of the network. They measure the performance of current model, and provide guidence to adjust parameters.
-
-Now that everything is ready, you can train the network with a simple command line call:
-
-    .. code-block:: bash
- 
-        paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
- 
-
-This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.
-
-
-Evaluate the Model
--------------------
-
-Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly.
-
-In PaddlePaddle, training is just to get a collection of model parameters, which are ``w`` and ``b`` in this case. Each parameter is saved in an individual file in the popular ``numpy`` array format. Here is the code that reads parameters from last pass.
-
-    .. code-block:: python
-
-        import numpy as np
-        import os
-
-        def load(file_name):
-            with open(file_name, 'rb') as f:
-                f.read(16) # skip header for float type.
-                return np.fromfile(f, dtype=np.float32)
-                
-        print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
-        # w=1.999743, b=0.300137
-
-    .. image:: parameters.png
-        :align: center
-
-Although starts from a random guess, you can see that value of ``w`` changes quickly towards 2 and ``b`` changes quickly towards 0.3. In the end, the predicted line is almost identical with real answer.
-
-There, you have recovered the underlying pattern between ``X`` and ``Y`` only from observed data.
diff --git a/doc/getstarted/basic_usage/parameters.png b/doc/getstarted/basic_usage/parameters.png
deleted file mode 100644
index 2ec67480951e21f0400bce1c34b3108dcd65c18c..0000000000000000000000000000000000000000
Binary files a/doc/getstarted/basic_usage/parameters.png and /dev/null differ
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3c525bdad6f6118dcd560e2cb7bfaf89737c1362
--- /dev/null
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -0,0 +1,141 @@
+从源码编译
+======================
+
+.. _build_step:
+
+编译方法
+----------------
+
+PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译工具。
+我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
+可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
+
+如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
+
+编译PaddlePaddle，需要执行：
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # 如果使用Docker编译环境，执行下面的命令编译CPU-Only的二进制
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   # 如果不使用Docker编译环境，执行下面的命令
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+
+编译完成后会在build/python/dist目录下生成输出的whl包，可以选在在当前机器安装也可以拷贝到目标机器安装：
+
+.. code-block:: bash
+
+   pip install python/dist/*.whl
+
+
+.. _run_test:
+
+执行单元测试
+----------------
+
+如果您期望在编译完成后立即执行所有的单元测试，可以按照下面的方法：
+
+使用Docker的情况下，设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
+开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+
+如果不使用Docker，可以执行ctest命令即可：
+
+.. code-block:: bash
+
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+   ctest
+   # 指定执行其中一个单元测试 test_mul_op
+   ctest -R test_mul_op
+
+.. _compile_deps:
+
+编译依赖
+----------------
+
+PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其他的依赖软件，会自动在编译时下载。
+
+.. csv-table:: PaddlePaddle编译依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.5", ""
+   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
+   "Python", "2.7.x", "依赖libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "可选"
+
+
+.. _build_options:
+
+编译选项
+----------------
+
+PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。
+用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考
+`官方文档 <https://cmake.org/cmake-tutorial>`_ 。
+
+在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如：
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: 编译选项说明
+    :header: "选项", "说明", "默认值"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "是否支持GPU", "ON"
+    "WITH_C_API", "是否仅编译CAPI", "OFF"
+    "WITH_DOUBLE", "是否使用双精度浮点数", "OFF"
+    "WITH_DSO", "是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。", "ON"
+    "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
+    "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
+    "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
+    "WITH_TESTING", "是否开启单元测试", "ON"
+    "WITH_DOC", "是否编译中英文文档", "OFF"
+    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
+    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
+    "WITH_MKL", "是否使用MKL数学库，如果为否则是用OpenBLAS", "ON"
+
+BLAS
++++++
+
+PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
+`OpenBlAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
+还会下载MKL-DNN数学库，详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
+
+如果关闭MKL，则会使用OpenBLAS作为BLAS库。
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。
+使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构，加速编译。
+
+PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cuDNN是同一个版本。
+我们推荐使用最新版本的cuDNN。
+
+编译选项的设置
+++++++++++++++
+
+PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（** :code:`rm -rf` ）**后，再指定。**
diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
deleted file mode 100644
index 2f1461489495618718d5abaeab9cbeda9b93700f..0000000000000000000000000000000000000000
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ /dev/null
@@ -1,236 +0,0 @@
-Installing from Sources
-==========================
-
-* [1. Download and Setup](#download)
-* [2. Requirements](#requirements)
-* [3. Build on Ubuntu](#ubuntu)
-* [4. Build on Centos](#centos)
-
-
-## <span id="download">Download and Setup</span> 
-You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
-
-```bash
-git clone https://github.com/PaddlePaddle/Paddle paddle
-cd paddle
-```
-## <span id="requirements">Requirements</span>
-
-To compile the source code, your computer must be equipped with the following dependencies.
-
-- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler
-- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
-- **BLAS**: MKL, OpenBlas or ATLAS
-- **Python**: only support Python 2.7
-- **Go**
-
-**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
-For CUDA 8.0, GCC versions later than 5.3 are not supported!
-
-### Options
-
-PaddlePaddle supports some build options. 
-
-<html>
-<table> 
-<thead>
-<tr>
-<th scope="col" class="left">Optional</th>
-<th scope="col" class="left">Description</th>
-</tr>
-</thead>
-<tbody>
-<tr><td class="left">WITH_GPU</td><td class="left">Compile PaddlePaddle with NVIDIA GPU</td></tr>
-<tr><td class="left">WITH_AVX</td><td class="left">Compile PaddlePaddle with AVX intrinsics</td></tr>
-<tr><td class="left">WITH_DSO</td><td class="left">Compile PaddlePaddle with dynamic linked CUDA</td></tr>
-<tr><td class="left">WITH_TESTING</td><td class="left">Compile PaddlePaddle with unit testing</td></tr>
-<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile PaddlePaddle with inference api</td></tr>
-<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile PaddlePaddle with style check</td></tr>
-<tr><td class="left">WITH_PYTHON</td><td class="left">Compile PaddlePaddle with python interpreter</td></tr>
-<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile PaddlePaddle with double precision</td></tr>
-<tr><td class="left">WITH_RDMA</td><td class="left">Compile PaddlePaddle with RDMA support</td></tr>
-<tr><td class="left">WITH_TIMER</td><td class="left">Compile PaddlePaddle with stats timer</td></tr>
-<tr><td class="left">WITH_PROFILER</td><td class="left">Compile PaddlePaddle with GPU profiler</td></tr>
-<tr><td class="left">WITH_DOC</td><td class="left">Compile PaddlePaddle with documentation</td></tr>
-<tr><td class="left">WITH_COVERAGE</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
-<tr><td class="left">COVERALLS_UPLOAD</td><td class="left">Package code coverage data to coveralls</td></tr>
-<tr><td class="left">ON_TRAVIS</td><td class="left">Exclude special unit test on Travis CI</td></tr>
-</tbody>
-</table>
-</html>
-
-**Note:**
-  - The GPU version works best with Cuda Toolkit 8.0 and cuDNN v5.
-  - Other versions like Cuda Toolkit 7.0, 7.5 and cuDNN v3, v4 are also supported.
-  - **To utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa.**
-
-As a simple example, consider the following:  
-
-1. **BLAS Dependencies(optional)**
-  
-    CMake will search BLAS libraries from the system. If not found, OpenBLAS will be downloaded, built and installed automatically.
-    To utilize preinstalled BLAS， you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
-
-    ```bash
-    # specify MKL
-    cmake .. -DMKL_ROOT=<mkl_path>
-    # or specify OpenBLAS
-    cmake .. -DOPENBLAS_ROOT=<openblas_path>
-    ```
-
-2. **Doc Dependencies(optional)**
-
-    To generate PaddlePaddle's documentation, install dependencies and set `-DWITH_DOC=ON` as follows:
-
-    ```bash
-    pip install 'sphinx>=1.4.0'
-    pip install sphinx_rtd_theme recommonmark
-
-    # install doxygen on Ubuntu
-    sudo apt-get install doxygen 
-    # install doxygen on Mac OS X
-    brew install doxygen
-
-    # active docs in cmake
-    cmake .. -DWITH_DOC=ON`
-    ```
-
-## <span id="ubuntu">Build on Ubuntu 14.04</span>
-
-### Install Dependencies
-
-- **Paddle Dependencies**
-
-    ```bash
-    # necessary
-    sudo apt-get update
-    sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake
-    sudo apt-get install -y python python-pip python-numpy libpython-dev bison
-    sudo pip install 'protobuf==3.1.0.post1'
-
-    # Install Go
-    # You can follow https://golang.org/doc/install for a detailed explanation.
-    wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
-    tar -C $HOME -xzf go.tgz && \
-    mkdir $HOME/gopath && \
-    rm go.tgz
-
-    # Setup environment variables
-    export GOROOT=$HOME/go
-    export GOPATH=$HOME/gopath
-    export PATH=$PATH:$GOROOT/bin
-
-    # install cmake 3.4
-    curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
-        cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
-        cd .. && rm -rf cmake-3.4.1
-    ```
-
-- **GPU Dependencies (optional)**
-
-    To build GPU version, you will need the following installed:
-
-        1. a CUDA-capable GPU
-        2. A supported version of Linux with a GCC compiler and toolchain
-        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
-
-    The CUDA development environment relies on tight integration with the host development environment,
-    including the host compiler and C runtime libraries, and is therefore only supported on
-    distribution versions that have been qualified for this CUDA Toolkit release.
-        
-    After downloading cuDNN library, issue the following commands:
-
-    ```bash
-    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
-    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
-    ```
-    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
-
-    ```bash
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
-    ```
-
-### Build and Install
-
-As usual, the best option is to create build folder under paddle project directory.
-
-```bash
-mkdir build && cd build
-``` 
-
-Finally, you can build and install PaddlePaddle:
-
-```bash
-# you can add build option here, such as:    
-cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
-# please use sudo make install, if you want to install PaddlePaddle into the system
-make -j `nproc` && make install
-# set PaddlePaddle installation path in ~/.bashrc
-export PATH=<path to install>/bin:$PATH
-# install PaddlePaddle Python modules.
-sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
-```
-
-## <span id="centos">Build on Centos 7</span>
-
-### Install Dependencies
-
-- **CPU Dependencies**
-
-    ```bash
-    # necessary
-    sudo yum update
-    sudo yum install -y epel-release
-    sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git
-    sudo pip install wheel numpy
-    sudo pip install 'protobuf>=3.0.0'
-    ```
-  
-- **GPU Dependencies (optional)**
-
-    To build GPU version, you will need the following installed:
-
-        1. a CUDA-capable GPU
-        2. A supported version of Linux with a GCC compiler and toolchain
-        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
-
-    The CUDA development environment relies on tight integration with the host development environment,
-    including the host compiler and C runtime libraries, and is therefore only supported on
-    distribution versions that have been qualified for this CUDA Toolkit release.
-        
-    After downloading cuDNN library, issue the following commands:
-
-    ```bash
-    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
-    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
-    ```
-    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
-
-    ```bash
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
-    ```
-
-### Build and Install
-
-As usual, the best option is to create build folder under paddle project directory.
-
-```bash
-mkdir build && cd build
-``` 
-
-Finally, you can build and install PaddlePaddle:
-  
-```bash
-# you can add build option here, such as:    
-cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
-# please use sudo make install, if you want to install PaddlePaddle into the system
-make -j `nproc` && make install
-# set PaddlePaddle installation path in ~/.bashrc
-export PATH=<path to install>/bin:$PATH
-# install PaddlePaddle Python modules.
-sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
-```
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..76fbc43de2e83580dd79b874507c103533022436
--- /dev/null
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -0,0 +1,159 @@
+Build from Sources
+==========================
+
+.. _build_step:
+
+How To Build
+----------------
+
+PaddlePaddle mainly uses `CMake <https://cmake.org>`_ and GCC, G++ as compile
+tools. We recommend you to use our pre-built Docker image to run the build
+to avoid installing dependencies by yourself. We have several build environment
+Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ .
+
+If you choose not to use Docker image for your build, you need to install the
+below `Compile Dependencies`_ before run the build.
+
+Then run:
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # run the following command to build a CPU-Only binaries if you are using docker
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   # else run these commands
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+
+When the compile finishes, you can get the output whl package under
+build/python/dist, then you can choose to install the whl on local
+machine or copy it to the target machine.
+
+.. code-block:: bash
+
+   pip install python/dist/*.whl
+
+
+.. _run_test:
+
+Run Tests
+----------------
+
+If you wish to run the tests, you may follow the below steps:
+
+When using Docker, set :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run test immediately after the build.
+Set :code:`WITH_GPU=ON` Can also run tests on GPU.
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+
+If you don't use Docker, just run ctest will start the tests:
+
+.. code-block:: bash
+
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
+   make
+   ctest
+   # run a single test like test_mul_op
+   ctest -R test_mul_op
+
+
+.. _compile_deps:
+
+Compile Dependencies
+----------------
+
+PaddlePaddle need the following dependencies when compiling, other dependencies
+will be downloaded automatically.
+
+.. csv-table:: PaddlePaddle Compile Dependencies
+   :header: "Dependency", "Version", "Description"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.5", ""
+   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
+   "Python", "2.7.x", "Need libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "Optional"
+
+
+.. _build_options:
+
+Build Options
+----------------
+
+Build options include whether build binaries for CPU or GPU, which BLAS
+library to use etc. You may pass these settings when running cmake.
+For detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`_ 。
+
+.. _build_options_bool:
+
+Bool Type Options
+----------------
+
+You can add :code:`-D` argument to pass such options, like:
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: Bool Type Options
+    :header: "Option", "Description", "Default"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "Build with GPU support", "ON"
+    "WITH_C_API", "Build only CAPI", "OFF"
+    "WITH_DOUBLE", "Build with double precision", "OFF"
+    "WITH_DSO", "Dynamically load CUDA libraries", "ON"
+    "WITH_AVX", "Build with AVX support", "ON"
+    "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
+    "WITH_STYLE_CHECK", "Check code style when building", "ON"
+    "WITH_TESTING", "Build unit tests", "ON"
+    "WITH_DOC", "Build documentaions", "OFF"
+    "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
+    "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
+    "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
+
+
+BLAS
++++++
+
+PaddlePaddle supports `MKL <https://software.intel.com/en-us/intel-mkl>`_ and
+`OpenBlAS <http://www.openblas.net/>`_ as BLAS library。By default it uses MKL.
+If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded
+and used, for more `details <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ .
+
+If you choose not to use MKL, then OpenBlAS will be used.
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle will automatically find CUDA and cuDNN when compiling and running.
+parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect SM architecture
+automatically in order to speed up the build.
+
+PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to
+keep on with latest cuDNN versions. Be sure to run with the same version of cuDNN
+you built.
+
+Pass Compile Options
+++++++++++++++
+
+You can pass compile options to use intended BLAS/CUDA/Cudnn libraries.
+When running cmake command, it will search system paths like
+:code:`/usr/lib:/usr/local/lib` and then search paths that you
+passed to cmake, i.e.
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.**
diff --git a/doc/getstarted/build_and_install/cmake.png b/doc/getstarted/build_and_install/cmake.png
deleted file mode 100644
index a58cd09ad99cf27cc1ca5785fe54d726b83a82f6..0000000000000000000000000000000000000000
Binary files a/doc/getstarted/build_and_install/cmake.png and /dev/null differ
diff --git a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
deleted file mode 100644
index be0c1ffa451b2901ec06621dd4d886f800b4562e..0000000000000000000000000000000000000000
--- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
+++ /dev/null
@@ -1,43 +0,0 @@
-PaddlePaddle的编译选项
-======================
-
-PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考 `官方文档 <https://cmake.org/cmake-tutorial>`_ 。
-
-Bool型的编译选项
-----------------
-用户可在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如
-
-..  code-block:: bash
-
-    cmake .. -DWITH_GPU=OFF
-
-..  csv-table:: Bool型的编译选项
-    :widths: 1, 7, 2
-    :file: compile_options.csv
-
-BLAS/CUDA/Cudnn的编译选项
---------------------------
-BLAS
-+++++
-
-PaddlePaddle支持以下任意一种BLAS库：`MKL <https://software.intel.com/en-us/intel-mkl>`_ ，`ATLAS <http://math-atlas.sourceforge.net/>`_ ，`OpenBlAS <http://www.openblas.net/>`_ 和 `REFERENCE BLAS <http://www.netlib.org/blas/>`_ 。
-
-..  csv-table:: BLAS路径相关的编译选项
-    :widths: 1, 2, 7
-    :file: cblas_settings.csv
-
-CUDA/Cudnn
-+++++++++++
-
-PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。
-
-编译选项的设置
-++++++++++++++
-
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时，首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
-
-..  code-block:: bash
-
-    cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
-
-注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（``rm -rf``）后，再指定。
diff --git a/doc/getstarted/build_and_install/cmake/cblas_settings.csv b/doc/getstarted/build_and_install/cmake/cblas_settings.csv
deleted file mode 100644
index a6356baf16a0d3d2499e39d2055d8ee878dcaef2..0000000000000000000000000000000000000000
--- a/doc/getstarted/build_and_install/cmake/cblas_settings.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-编译选项,描述,注意
-MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h，${MKL_ROOT}/lib目录下需要包含mkl_core，mkl_sequential和mkl_intel_lp64三个库。
-ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h，${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。
-OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h，${OPENBLAS_ROOT}/lib下需要包含openblas库。
-REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h，${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。
\ No newline at end of file
diff --git a/doc/getstarted/build_and_install/cmake/compile_options.csv b/doc/getstarted/build_and_install/cmake/compile_options.csv
deleted file mode 100644
index 463b825470579d0c3736a408b1e82dd33e6f8d42..0000000000000000000000000000000000000000
--- a/doc/getstarted/build_and_install/cmake/compile_options.csv
+++ /dev/null
@@ -1,12 +0,0 @@
-选项,说明,默认值
-WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链
-WITH_DOUBLE,是否使用双精度浮点数。,否
-WITH_DSO,是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。,是
-WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是
-WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是
-WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
-WITH_RDMA,是否开启RDMA,否
-WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢，打印的日志变多，但是方便调试和测Benchmark,否
-WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST
-WITH_DOC,是否编译中英文文档,否
-WITH_SWIG_PY,是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练,取决于是否寻找到SWIG
\ No newline at end of file
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 0d34dec8e908c5e61001500725187a2233797f46..f78b1fb0e11aa028a4b7abb5270740b97f8039e9 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -1,222 +1,139 @@
-PaddlePaddle的Docker容器使用方式
+使用Docker安装运行
 ================================
 
-PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统（包括Linux，Mac OS X和Windows）上运行。 请注意，您需要更改 `Dockers设置 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 才能充分利用Mac OS X和Windows上的硬件资源。
+使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。
+您可以在 `Docker官网 <https://docs.docker.com/get-started/>`_ 获得基本的Docker安装和使用方法。
 
-Docker使用入门
-------------------------------
-
-几个基础的概念帮助理解和使用Docker：
+如果您在使用Windows，可以参考
+`这篇 <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+教程，完成在Windows上安装和使用Docker。
 
-- *镜像*：一个Docker镜像是一个打包好的软件。它包含了这个软件本身和它所依赖的运行环境。PaddlePaddle的Docker镜像就包含了PaddlePaddle的Python库以及其依赖的多个Python库。这样我们可以直接在Docker中运行需要的程序而不需要安装后在执行。可以执行：
+在了解Docker的基本使用方法之后，即可开始下面的步骤：
 
-  .. code-block:: bash
+.. _docker_pull:
 
-     docker images
+获取PaddlePaddle的Docker镜像
+------------------------------
 
-  来列出当前系统中的所有镜像，同样可以执行：
+执行下面的命令获取最新的PaddlePaddle Docker镜像
 
   .. code-block:: bash
-		  
-     docker pull paddlepaddle/paddle:0.10.0
 
-  来下载Docker镜像，paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的，推荐国内用户使用docker.paddlepaddle.org/paddle下载。
+     docker pull paddlepaddle/paddle
 
-- *容器*： 如果说一个Docker镜像就是一个程序，那容器就是这个程序运行时产生的“进程”。
-  实际上，一个容器就是一个操作系统的进程，但是是运行在独立的进程空间，文件系统以及网络之上。
-  可以执行：
+对于国内用户，我们提供了加速访问的镜像源：
 
   .. code-block:: bash
 
-     docker run paddlepaddle/paddle:0.10.0
+     docker pull docker.paddlepaddle.org/paddle
 
-  来使用一个镜像启动一个容器。
-
-- 默认情况下，Docker容器会运行在独立的文件系统空间之上，我们无法在Docker容器中
-  访问到主机上的文件。可以通过*挂载Volume*的方式，将主机上的文件或目录挂载到
-  Docker容器中。下面的命令把当前目录挂载到了容器中的 /data 目录下，容器使用
-  debian镜像，并且启动后执行 :code:`ls /data`。
+下载GPU版本的Docker镜像：
 
   .. code-block:: bash
 
-     docker run --rm -v $(pwd):/data debian ls /data
-
-PaddlePaddle发布的Docker镜像使用说明
-------------------------------
-
-我们把PaddlePaddle的编译环境打包成一个镜像，称为开发镜像，里面涵盖了
-PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打包成一个镜
-像，称为生产镜像，里面涵盖了PaddlePaddle运行所需的所有环境。每次
-PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运
-行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在
-`dockerhub.com <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 
-和国内镜像`docker.paddlepaddle.org` 提供最新
-的Docker镜像，可以在"tags"标签下找到最新的Paddle镜像版本。
-
-**注意：为了方便在国内的开发者下载Docker镜像，我们提供了国内的镜像服务器供大家使用。如果您在国内，请把文档里命令中的paddlepaddle/paddle替换成docker.paddlepaddle.org/paddle。**
-
-1. 开发镜像：:code:`paddlepaddle/paddle:0.10.0-dev`
-
-   这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境，完成开发，编译，发布，
-   文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具，所以如果需要自行配置开发环境需要考虑版本的因素。
-   开发镜像包含了以下工具：
-   
-   - gcc/clang
-   - nvcc
-   - Python
-   - sphinx
-   - woboq
-   - sshd
-   很多开发者会使用远程的安装有GPU的服务器工作，用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec`进入开发镜像并开始工作，
-   也可以在开发镜像中启动一个SSHD服务，方便开发者直接登录到镜像中进行开发:
-
-   以交互容器方式运行开发镜像：
-
-   .. code-block:: bash
-
-      docker run -it --rm -v $(pwd):/paddle  paddlepaddle/paddle:0.10.0-dev /bin/bash
-
-   或者，可以以后台进程方式运行容器：
-
-   .. code-block:: bash
-
-      docker run -d -p 2202:22 -p 8888:8888 -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /usr/sbin/sshd -D
-
-   然后用密码 :code:`root` SSH进入容器：
-
-   .. code-block:: bash
-
-      ssh -p 2202 root@localhost
-
-   SSH方式的一个优点是我们可以从多个终端进入容器。比如，一个终端运行vi，另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上，并在笔记本上通过SSH与其连接。
-
-2. 生产镜像：根据CPU、GPU和非AVX区分了如下4个镜像：
-
-   - GPU/AVX：:code:`paddlepaddle/paddle:<version>-gpu`
-   - GPU/no-AVX：:code:`paddlepaddle/paddle:<version>-gpu-noavx`
-   - CPU/AVX：:code:`paddlepaddle/paddle:<version>`
-   - CPU/no-AVX：:code:`paddlepaddle/paddle:<version>-noavx`
-
-   纯CPU镜像以及GPU镜像都会用到AVX指令集，但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX：
-
-   .. code-block:: bash
-
-      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-
-   如果输出是No，就需要选择使用no-AVX的镜像
-
-   **注：在0.10.0之后的版本，PaddlePaddle都可以自动判断硬件是否支持AVX，所以无需判断AVX即可使用**
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddle.org/paddle:latest-gpu
 
-   以上方法在GPU镜像里也能用，只是请不要忘记提前在物理机上安装GPU最新驱动。
-   为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。
+选择下载使用不同的BLAS库的Docker镜像：
 
-   .. code-block:: bash
-
-      nvidia-docker run -it --rm paddledev/paddle:0.10.0-gpu /bin/bash
+  .. code-block:: bash
 
-   注意: 如果使用nvidia-docker存在问题，你也许可以尝试更老的方法，具体如下，但是我们并不推荐这种方法。：
+     # 默认是使用MKL的镜像
+     docker pull paddlepaddle/paddle
+     # 使用OpenBLAS的镜像
+     docker pull paddlepaddle/paddle:latest-openblas
 
-   .. code-block:: bash
+下载指定版本的Docker镜像，可以从 `DockerHub网站 <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 获取可选的tag，并执行下面的命令：
 
-      export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-      export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-      docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0-gpu
+  .. code-block:: bash
 
-3. 运行以及发布您的AI程序
+     docker pull paddlepaddle/paddle:[tag]
+     # 比如：
+     docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu
 
-   假设您已经完成了一个AI训练的python程序 :code:`a.py`，这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行：
+.. _docker_run:
 
-   .. code-block:: bash
+在Docker中执行PaddlePaddle训练程序
+------------------------------
 
-      docker run -it -v $PWD:/work paddle /work/a.py
+假设您已经在当前目录（比如在/home/work）编写了一个PaddlePaddle的程序 :code:`train.py` （可以参考
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
+编写），就可以使用下面的命令开始执行训练：
 
-   如果要使用GPU，请运行：
+  .. code-block:: bash
 
-   .. code-block:: bash
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
+ 
+上述命令中， :code:`-it` 参数说明容器已交互式运行； :code:`-v $PWD:/work`
+指定将当前路径（Linux中$PWD变量会展开为当前路径的绝对路径）挂载到容器内部的 :code:`/work`
+目录； :code:`paddlepaddle/paddle` 指定需要使用的容器； 最后 :code:`/work/train.py`
+为容器内执行的命令，即运行训练程序。
 
-      nvidia-docker run -it -v $PWD:/work paddle /work/a.py
+当然，您也可以进入到Docker容器中，以交互式的方式执行或调试您的代码：
 
+  .. code-block:: bash
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
 
-   这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像，可以编写`Dockerfile`使用`FROM paddledev/paddle:0.10.0`
-   创建和发布自己的AI程序镜像。
+**注：PaddlePaddle Docker镜像为了减小体积，默认没有安装vim，您可以在容器中执行** :code:`apt-get install -y vim` **安装后，在容器中编辑代码。**
 
-运行PaddlePaddle Book
----------------------
+.. _docker_run_book:
 
-Jupyter Notebook是一个开源的web程序，大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。
+使用Docker启动PaddlePaddle Book教程
+------------------------------
 
+使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook，可以通过网页浏览。
 PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
 如果您想要更深入了解deep learning，PaddlePaddle Book一定是您最好的选择。
+大家可以通过它阅读教程，或者制作和分享带有代码、公式、图表、文字的交互式文档。
 
 我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行：
 
-.. code-block:: bash
+  .. code-block:: bash
 
-    docker run -p 8888:8888 paddlepaddle/book
+     docker run -p 8888:8888 paddlepaddle/book
 
 然后在浏览器中输入以下网址：
 
-.. code-block:: text
+  .. code-block:: text
 
-    http://localhost:8888/
+     http://localhost:8888/
 
 就这么简单，享受您的旅程！
 
-通过Docker容器开发PaddlePaddle
-------------------------------
-
-开发人员可以在Docker开发镜像中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux，Mac OS X和Windows。
+.. _docker_run_gpu:
 
-1. 制作PaddlePaddle开发镜像
-
-   PaddlePaddle每次发布新版本都会发布对应的开发镜像供开发者直接使用。这里介绍如生成造这个开发镜像。
-   生成Docker镜像的方式有两个，一个是直接把一个容器转换成镜像，另一个是创建Dockerfile并运行docker build指令按照Dockerfile生成镜像。第一个方法的好处是简单快捷，适合自己实验，可以快速迭代。第二个方法的好处是Dockerfile可以把整个生成流程描述很清楚，其他人很容易看懂镜像生成过程，持续集成系统也可以简单地复现这个过程。我们采用第二个方法。Dockerfile位于PaddlePaddle repo的根目录。生成生产镜像只需要运行：
-
-   .. code-block:: bash
-      
-      git clone https://github.com/PaddlePaddle/Paddle.git
-      cd Paddle
-      docker build -t paddle:dev .
-
-   docker build这个命令的-t指定了生成的镜像的名字，这里我们用paddle:dev。到此，PaddlePaddle开发镜像就被构建完毕了。
+使用Docker执行GPU训练
+------------------------------
 
-2. 制作PaddlePaddle生产镜像
+为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_ 来运行镜像。
+请不要忘记提前在物理机上安装GPU最新驱动。
 
-   生产镜像的生成分为两步，第一步是运行：
+  .. code-block:: bash
 
-   .. code-block:: bash
-      
-      docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev
+     nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
 
-   以上命令会编译PaddlePaddle，生成运行程序，以及生成创建生产镜像的Dockerfile。所有生成的的文件都在build目录下。“WITH_GPU”控制生成的生产镜像是否支持GPU，“WITH_AVX”控制生成的生产镜像是否支持AVX，”WITH_TEST“控制是否生成单元测试。
+**注: 如果没有安装nvidia-docker，可以尝试以下的方法，将CUDA库和Linux设备挂载到Docker容器内：**
 
-   第二步是运行：
+  .. code-block:: bash
 
-   .. code-block:: bash
-      
-      docker build -t paddle:prod -f build/Dockerfile ./build
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
 
-   以上命令会按照生成的Dockerfile把生成的程序拷贝到生产镜像中并做相应的配置，最终生成名为paddle:prod的生产镜像。
+**关于AVX：**
 
-3. 运行单元测试
+AVX是一种CPU指令集，可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
+是开启AVX编译的，所以，如果您的电脑不支持AVX，需要单独
+`编译 <./build_from_source_cn.rst>`_ PaddlePaddle为no-avx版本。
 
-   运行以下指令：
+以下指令能检查Linux电脑是否支持AVX：
 
    .. code-block:: bash
-      
-      docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
-
-文档
-----
-
-Paddle的Docker开发镜像带有一个通过 `woboq code browser
-<https://github.com/woboq/woboq_codebrowser>`_ 生成的HTML版本的C++源代码，便于用户浏览C++源码。
 
-只要在Docker里启动PaddlePaddle的时候给它一个名字，就可以再运行另一个Nginx Docker镜像来服务HTML代码：
-
-.. code-block:: bash
-
-   docker run -d --name paddle-cpu-doc paddle:0.10.0-dev
-   docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
 
-接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。
+如果输出是No，就需要选择使用no-AVX的镜像
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 94860240f6a4a9bed8a865684a8a79960489280e..d7acc7aeb744b19d83acb520d07c8551168dd096 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -1,270 +1,146 @@
-PaddlePaddle in Docker Containers
+Run in Docker Containers
 =================================
 
-Docker container is currently the only officially-supported way to
-running PaddlePaddle.  This is reasonable as Docker now runs on all
-major operating systems including Linux, Mac OS X, and Windows.
-Please be aware that you will need to change `Dockers settings
-<https://github.com/PaddlePaddle/Paddle/issues/627>`_ to make full use
-of your hardware resource on Mac OS X and Windows.
+Run PaddlePaddle in Docker container so that you don't need to care about
+runtime dependencies, also you can run under Windows system. You can get
+tutorials at `here <https://docs.docker.com/get-started/>`_ .
 
-Working With Docker
--------------------
+If you are using Windows, please refer to
+`this <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+tutorial to start running docker under windows.
 
-Docker is simple as long as we understand a few basic concepts:
+After you've read above tutorials you may proceed the following steps.
 
-- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle's Docker image includes pre-built PaddlePaddle and Python and many Python packages. We can run a Docker image directly, other than installing all these software. We can type
+.. _docker_pull:
 
-  .. code-block:: bash
-
-     docker images
+Pull PaddlePaddle Docker Image
+------------------------------
 
-  to list all images in the system. We can also run
+Run the following command to download the latest Docker images:
 
   .. code-block:: bash
-		  
-     docker pull paddlepaddle/paddle:0.10.0
 
-  to download a Docker image, paddlepaddle/paddle in this example,
-  from Dockerhub.com.
+     docker pull paddlepaddle/paddle
 
-- *container*: considering a Docker image a program, a container is a
-  "process" that runs the image. Indeed, a container is exactly an
-  operating system process, but with a virtualized filesystem, network
-  port space, and other virtualized environment. We can type
+For users in China, we provide a faster mirror:
 
   .. code-block:: bash
 
-     docker run paddlepaddle/paddle:0.10.0
+     docker pull docker.paddlepaddle.org/paddle
 
-  to start a container to run a Docker image, paddlepaddle/paddle in this example.
-
-- By default docker container have an isolated file system namespace,
-  we can not see the files in the host file system. By using *volume*,
-  mounted files in host will be visible inside docker container.
-  Following command will mount current dirctory into /data inside
-  docker container, run docker container from debian image with
-  command :code:`ls /data`.
+Download GPU version images:
 
   .. code-block:: bash
 
-     docker run --rm -v $(pwd):/data debian ls /data
-
-Usage of CPU-only and GPU Images
-----------------------------------
-
-We package PaddlePaddle's compile environment into a Docker image,
-called the develop image, it contains all compiling tools that
-PaddlePaddle needs. We package compiled PaddlePaddle program into a
-Docker image as well, called the production image, it contains all
-runtime environment that running PaddlePaddle needs. For each version
-of PaddlePaddle, we release both of them. Production image includes
-CPU-only version and a CUDA GPU version and their no-AVX versions.
-
-We put the docker images on `dockerhub.com
-<https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_. You can find the
-latest versions under "tags" tab at dockerhub.com. 
-
-** NOTE: If you are in China, you can use our Docker image registry mirror to speed up the download process. To use it, please replace all paddlepaddle/paddle in the commands to docker.paddlepaddle.org/paddle.**
-
-
-1. development image :code:`paddlepaddle/paddle:<version>-dev`
-
-   This image has packed related develop tools and runtime
-   environment. Users and developers can use this image instead of
-   their own local computer to accomplish development, build,
-   releasing, document writing etc. While different version of paddle
-   may depends on different version of libraries and tools, if you
-   want to setup a local environment, you must pay attention to the
-   versions.  The development image contains:
-   
-   - gcc/clang
-   - nvcc
-   - Python
-   - sphinx
-   - woboq
-   - sshd
-     
-   Many developers use servers with GPUs, they can use ssh to login to
-   the server and run :code:`docker exec` to enter the docker
-   container and start their work.  Also they can start a development
-   docker image with SSHD service, so they can login to the container
-   and start work.
-
-2. Production images, this image might have multiple variants:
-
-   - GPU/AVX：:code:`paddlepaddle/paddle:<version>-gpu`
-   - GPU/no-AVX：:code:`paddlepaddle/paddle:<version>-gpu-noavx`
-   - CPU/AVX：:code:`paddlepaddle/paddle:<version>`
-   - CPU/no-AVX：:code:`paddlepaddle/paddle:<version>-noavx`
-
-   Please be aware that the CPU-only and the GPU images both use the
-   AVX instruction set, but old computers produced before 2008 do not
-   support AVX.  The following command checks if your Linux computer
-   supports AVX:
-
-   .. code-block:: bash
-
-      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-
-   **NOTE：versions after 0.10.0 will automatically detect system AVX support, so manual detect is not needed in this case.**
-   To run the CPU-only image as an interactive container:
-
-   .. code-block:: bash
-
-      docker run -it --rm paddlepaddle/paddle:0.10.0 /bin/bash
-
-   Above method work with the GPU image too -- the recommended way is
-   using `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_.
-
-   Please install nvidia-docker first following this `tutorial
-   <https://github.com/NVIDIA/nvidia-docker#quick-start>`_.
-
-   Now you can run a GPU image:
-
-   .. code-block:: bash
-
-      nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash
-
-
-Train Model Using Python API
-----------------------------
-
-Our official docker image provides a runtime for PaddlePaddle
-programs. The typical workflow will be as follows:
-
-Create a directory as workspace:
-
-.. code-block:: bash
-
-   mkdir ~/workspace
-
-Edit a PaddlePaddle python program using your favourite editor
-
-.. code-block:: bash
-
-   emacs ~/workspace/example.py
-
-Run the program using docker:
-
-.. code-block:: bash
-
-   docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 python /workspace/example.py
-
-Or if you are using GPU for training:
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddle.org/paddle:latest-gpu
 
-.. code-block:: bash
+Choose between different BLAS version:
 
-   nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu python /workspace/example.py
-
-Above commands will start a docker container by running :code:`python
-/workspace/example.py`. It will stop once :code:`python
-/workspace/example.py` finishes.
-
-Another way is to tell docker to start a :code:`/bin/bash` session and
-run PaddlePaddle program interactively:
-
-.. code-block:: bash
-
-   docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 /bin/bash
-   # now we are inside docker container
-   cd /workspace
-   python example.py
-
-Running with GPU is identical:
-
-.. code-block:: bash
-
-   nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu /bin/bash
-   # now we are inside docker container
-   cd /workspace
-   python example.py
-
-
-Develop PaddlePaddle or Train Model Using C++ API
----------------------------------------------------
-
-We will be using PaddlePaddle development image since it contains all
-compiling tools and dependencies.
+  .. code-block:: bash
 
-1. Build PaddlePaddle develop image
+     # image using MKL by default
+     docker pull paddlepaddle/paddle
+     # image using OpenBLAS
+     docker pull paddlepaddle/paddle:latest-openblas
 
-   Use following command to build PaddlePaddle develop image:
 
-   .. code-block:: bash
+If you want to use legacy versions, choose a tag from
+`DockerHub <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_
+and run:
 
-      git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle
-      docker build -t paddle:dev .
-
-2. Build PaddlePaddle production image
+  .. code-block:: bash
 
-   There are two steps for building production image, the first step is to run:
+     docker pull paddlepaddle/paddle:[tag]
+     # i.e.
+     docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu
 
-   .. code-block:: bash
+.. _docker_run:
 
-      docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev
+Launch your training program in Docker
+------------------------------
 
-   The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated.
+Assume that you have already written a PaddlePaddle program
+named :code:`train.py` under directory :code:`/home/work` (refer to 
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
+for more samples), then run the following command:
 
-   The second step is to run:
+  .. code-block:: bash
 
-   .. code-block:: bash
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
 
-      docker build -t paddle:prod -f build/Dockerfile ./build
+In the above command, :code:`-it` means run the container interactively;
+:code:`-v $PWD:/work` means mount the current directory ($PWD will expand
+to current absolute path in Linux) under :code:`/work` in the container.
+:code:`paddlepaddle/paddle` to specify image to use; finnally
+:code:`/work/train.py` is the command to run inside docker.
 
-   The above command will generate the production image by copying the compiled PaddlePaddle program into the image.
+Also, you can go into the container shell, run or debug your code
+interactively:
 
-3. Run unit test
+  .. code-block:: bash
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
 
-   Following command will run unit test:
+**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.**
 
-   .. code-block:: bash
-      
-      docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+.. _docker_run_book:
 
 PaddlePaddle Book
 ------------------
 
-The Jupyter Notebook is an open-source web application that allows
-you to create and share documents that contain live code, equations,
-visualizations and explanatory text in a single browser.
-
-PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
-We already exposed port 8888 for this book. If you want to
+You can create a container serving PaddlePaddle Book using Jupyter Notebook in
+one minute using Docker. PaddlePaddle Book is an interactive Jupyter Notebook
+for users and developers.If you want to
 dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
 
 We provide a packaged book image, simply issue the command:
 
-.. code-block:: bash
+  .. code-block:: bash
 
-    docker run -p 8888:8888 paddlepaddle/book
+     docker run -p 8888:8888 paddlepaddle/book
 
 Then, you would back and paste the address into the local browser:
 
-.. code-block:: text
+  .. code-block:: text
 
-    http://localhost:8888/
+     http://localhost:8888/
 
 That's all. Enjoy your journey!
 
+.. _docker_run_gpu:
 
-Documentation
--------------
+Train with Docker with GPU
+------------------------------
 
-Paddle Docker images include an HTML version of C++ source code
-generated using `woboq code browser
-<https://github.com/woboq/woboq_codebrowser>`_.  This makes it easy
-for users to browse and understand the C++ source code.
+We recommend using
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_
+to run GPU training jobs. Please ensure you have latest
+GPU driver installed before move on.
 
-As long as we give the Paddle Docker container a name, we can run an
-additional Nginx Docker container to serve the volume from the Paddle
-container:
+  .. code-block:: bash
 
-.. code-block:: bash
+     nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
 
-   docker run -d --name paddle-cpu-doc paddle:<version>
-   docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
+**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
 
+  .. code-block:: bash
 
-Then we can direct our Web browser to the HTML version of source code
-at http://localhost:8088/paddle/
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
+
+**About AVX:**
+
+AVX is a kind of CPU instruction can accelerate PaddlePaddle's calculations.
+The latest PaddlePaddle Docker image turns AVX on by default, so, if your
+computer doesn't support AVX, you'll probably need to
+`build <./build_from_source_en.rst>`_ with :code:`WITH_AVX=OFF`.
+
+The following command will tell you whether your computer supports AVX.
+
+   .. code-block:: bash
+
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst
index dd9923697ab85825557aa89a08870bece7c76673..88c5142ddee994ed0c0dc520195311e97f5a549e 100644
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
@@ -6,12 +6,13 @@
 安装流程
 ++++++++
 
-PaddlePaddle提供Docker镜像来部署环境。
+PaddlePaddle提供pip和Docker的安装方式：
 
 .. toctree::
    :maxdepth: 1
-   
-   docker_install_cn.rst 
+
+   pip_install_cn.rst
+   docker_install_cn.rst
 
 
 编译流程
@@ -19,9 +20,14 @@ PaddlePaddle提供Docker镜像来部署环境。
 
 ..  warning::
 
-    编译流程主要推荐高级用户查看，普通用户请走安装流程。
+    建议直接使用上述安装流程，方便快速安装。只有在遇到需要独立定制的二进制时才需要编译。
 
 ..  toctree::
     :maxdepth: 1
 
-    cmake/build_from_source_cn.rst
+    build_from_source_cn.rst
+
+常见问题解答
+++++++++++
+
+`常见问题解答 <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_cn.html>`_
diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/getstarted/build_and_install/index_en.rst
index 8a53588e0439df8f4d5fd529b7a20262c67d4e58..c8b60d03578ba6a9b73134ec53b440d057e36079 100644
--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/getstarted/build_and_install/index_en.rst
@@ -1,22 +1,33 @@
 Install and Build
 =================
 
-Install PaddlePaddle
-----------------------
+.. _install_steps:
 
-..  toctree::
-    :maxdepth: 1
+Install Steps
+++++++++
+
+You can choose either pip or Docker to complete your install:
+
+.. toctree::
+   :maxdepth: 1
+
+   pip_install_en.rst
+   docker_install_en.rst
 
-    docker_install_en.rst
 
 Build from Source
 -----------------
 
 ..  warning::
 
-    Please use :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.
+    We recommend to directly install via above installation steps, you'll only need to build PaddlePaddle from source when you need a modifed binary.
 
 ..  toctree::
     :maxdepth: 1
 
     build_from_source_en.md
+
+FAQ
+++++++++++
+
+`FAQ <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_en.html>`_
diff --git a/doc/getstarted/build_and_install/paddleci.png b/doc/getstarted/build_and_install/paddleci.png
new file mode 100644
index 0000000000000000000000000000000000000000..16087ce059aa3c07ce8c927d983eb86351915825
Binary files /dev/null and b/doc/getstarted/build_and_install/paddleci.png differ
diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b26bf4c95cb18f36408eb75894e8b9b674efc67b
--- /dev/null
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
@@ -0,0 +1,86 @@
+使用pip安装
+================================
+
+PaddlePaddle可以使用常用的Python包管理工具
+`pip <https://pip.pypa.io/en/stable/installing/>`_
+完成安装，并可以在大多数主流的Linux操作系统以及MacOS上执行。
+
+.. _pip_install:
+
+使用pip安装
+------------------------------
+
+
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件。
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+
+如果需要安装支持GPU的版本，需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+如果需要获取并安装最新的（开发分支）PaddlePaddle，可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装，
+您可以从下面的表格中找到需要的版本：
+
+如果在点击下面链接时出现如下登陆界面，点击“Log in as guest”即可开始下载：
+
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+
+..  csv-table:: 各个版本最新的whl包
+    :header: "版本说明", "cp27-cp27mu", "cp27-cp27mu", "C-API"
+    :widths: 1, 3, 3, 3
+
+    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+
+.. _pip_dependency:
+
+运行环境依赖
+------------------------------
+
+PaddlePaddle安装包由于不仅仅包含.py程序，而且包含了C++编写的部分，所以我们确保发布的二进制包可以支持主流的Linux操作系统，比如CentOS 6以上，Ubuntu 14.04以上，MacOS 10.12以上。
+
+PaddlePaddle发布的安装包会尽量对齐 `manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_ 标准，通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上，而且CentOS 5即将停止维护，所以我们默认使用CentOS 6作为标准编译环境。
+
+.. csv-table:: PaddlePaddle环境依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "操作系统", "Linux, MacOS", "CentOS 6以上，Ubuntu 14.04以上，MacOS 10.12以上"
+   "Python", "2.7.x", "暂时不支持Python3"
+   "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号"
+   "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号"
+
+.. _pip_faq:
+
+安装常见问题和解决方法
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+  
+  出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准，需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip：
+
+    .. code-block:: bash
+
+       pip install --upgrade pip
+
+  如果仍然存在问题，可以执行：
+
+      .. code-block:: bash
+
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  获取当前系统支持的安装包格式，并检查和需安装的包是否匹配。pypi安装包可以在 `这个 <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ 链接中找到。
+
+  如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ，需要升级pip版本到最新； 如果系统支持 manylinux1_x86_64 而安装包（本地）是 linux_x86_64 ，可以重命名这个whl包为 manylinux1_x86_64 再安装。
\ No newline at end of file
diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..113790e4e4ca116e91f11f8a233eae874d9d1b7a
--- /dev/null
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
@@ -0,0 +1,104 @@
+Install Using pip
+================================
+
+You can use current widely used Python package management
+tool `pip <https://pip.pypa.io/en/stable/installing/>`_
+to install PaddlePaddle. This method can be used in
+most of current Linux systems or MacOS.
+
+.. _pip_install:
+
+Install Using pip
+------------------------------
+
+Run the following command to install PaddlePaddle on the current
+machine, it will also download requirements.
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+
+If you wish to install GPU version, just run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+If you wish to install the latest develop branch PaddlePaddle, 
+you can download the latest whl package from our CI system. Access
+the below links, log in as guest, then click at the "Artifact"
+tab, you'll find the download link of whl packages.
+
+If the links below shows up the login form, just click "Log in as guest" to start the download:
+
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+
+..  csv-table:: whl package of each version
+    :header: "version", "cp27-cp27mu", "cp27-cp27mu", "C-API"
+    :widths: 1, 3, 3, 3
+
+    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+
+.. _pip_dependency:
+
+Runtime Dependency
+------------------------------
+
+PaddlePaddle installation packages (whl) does not only contain .py files,
+but also binaries built from C++ code. We ensure that PaddlePaddle can
+run on current mainline Linux distributions, like CentOS 6, Ubuntu 14.04
+and MacOS 10.12.
+
+PaddlePaddle whl packages are trying to satisfy
+`manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_
+standard, which uses CentOS 5 as default build environment. But CUDA libraries
+seems only run on CentOS 6 at least, also, CentOS 5 is about to end its lifetime,
+so we use CentOS 6 as default build environment.
+
+.. csv-table:: PaddlePaddle Runtime Deps
+   :header: "Dependency", "version", "description"
+   :widths: 10, 15, 30
+
+   "OS", "Linux, MacOS", "CentOS 6 or later，Ubuntu 14.04 or later，MacOS 10.12 or later"
+   "Python", "2.7.x", "Currently Python3 is not supported"
+   "libc.so", "GLIBC_2.7", "glibc at least include GLIBC_2.7 symbols"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "At least include GLIBCXX_3.4.11, CXXABI_1.3.3 symbols"
+   "libgcc_s.so", "GCC_3.3", "At least include GCC_3.3 symbols"
+
+.. _pip_faq:
+
+FAQ
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+  
+  The main cause of this issue is that your current platform is
+  not supported. Please check that you are using Python 2.7 series.
+  Besides, pypi only supports manylinux1 standard, you'll need to
+  upgrade your pip to >9.0.0. Then run the below command:
+
+    .. code-block:: bash
+
+       pip install --upgrade pip
+
+  If the problem still exists, run the following command:
+
+      .. code-block:: bash
+
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  Then you'll get supported package suffixes, then check if it matches
+  the file name of the whl package. You can find default whl package at
+  `here <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_
+
+  If your system supports linux_x86_64 but the whl package is manylinux1_x86_64,
+  you'll need to update pip to the latest version; If your system supports
+  manylinux1_x86_64 but the whl package is linux_x86_64 you can rename the
+  file to manylinux1_x86_64 suffix and then install.
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
index aa418c657a4ba16cce61c030066f4d3e14e891cc..a9087be6f350c5656cabb0c64ba0f200d1c666cc 100644
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -1,10 +1,61 @@
 新手入门
 ============
 
+.. _quick_install:
+
+快速安装
+++++++++
+
+PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
+执行下面的命令完成快速安装：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+如果需要安装支持GPU的版本，需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+更详细的安装和编译方法参考：
+
 ..  toctree::
   :maxdepth: 1
 
   build_and_install/index_cn.rst
-  concepts/use_concepts_cn.rst
 
-- `深度学习入门课程 <http://book.paddlepaddle.org/index.cn.html>`_
+.. _quick_start:
+
+快速开始
+++++++++
+
+创建一个 housing.py 并粘贴此Python代码：
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
+
+..  toctree::
+  :maxdepth: 1
+
+  concepts/use_concepts_cn.rst
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index be3253e3d41b99a2b696e2c5ef6463ed49680d69..d14e3f5c0cc90792fce9cb82e65da482c44dc433 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -1,9 +1,61 @@
 GET STARTED
 ============
 
+.. _quick_install:
+
+Quick Install
+----------------------
+
+You can use pip to install PaddlePaddle with a single command, supports
+CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
+Simply run the following command to install:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+If you need to install GPU version, run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+For more details about installation and build:
+
 ..  toctree::
   :maxdepth: 1
 
   build_and_install/index_en.rst
 
-- `Deep Learning 101 <http://book.paddlepaddle.org/index.html>`_
+
+.. _quick_start:
+
+Quick Start
+++++++++
+
+Create a new file called housing.py, and paste this Python
+code:
+
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+Run :code:`python housing.py` and voila! It should print out a list of predictions
+for the test housing data.
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst
index 6e4e27dd00edce2497ba0c11f5c44f284855fa31..3dddbbe506d4a3f6b669eabd6c5605640d29f9ef 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -72,7 +72,7 @@ PaddlePaddle的文档构建有三种方式。
     cd TO_YOUR_PADDLE_CLONE_PATH
     mkdir -p build
     cd build
-    cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
+    cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
     make gen_proto_py
     make paddle_docs paddle_docs_cn
 
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index fbf0d2d3ae7597b87d014a746be540b067798a44..61bf25ccd12eeedffc747fdd4ce84fa4adde07ee 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -18,7 +18,6 @@ Development
 ..  toctree::
   :maxdepth: 1
 
-  dev/build_en.rst
   dev/new_layer_en.rst
   dev/contribute_to_paddle_en.md
   dev/write_docs_en.rst
diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3330b0b59d65d81d565d553349c39945ef82e42
--- /dev/null
+++ b/doc/howto/optimization/cpu_profiling.md
@@ -0,0 +1,163 @@
+此教程会介绍如何使用Python的cProfile包，与Python库yep，google perftools来运行性能分析(Profiling)与调优。
+
+运行性能分析可以让开发人员科学的，有条不紊的对程序进行性能优化。性能分析是性能调优的基础。因为在程序实际运行中，真正的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。
+
+性能优化的步骤，通常是循环重复若干次『性能分析 --> 寻找瓶颈 ---> 调优瓶颈 --> 性能分析确认调优效果』。其中性能分析是性能调优的至关重要的量化指标。
+
+Paddle提供了Python语言绑定。用户使用Python进行神经网络编程，训练，测试。Python解释器通过`pybind`和`swig`调用Paddle的动态链接库，进而调用Paddle C++部分的代码。所以Paddle的性能分析与调优分为两个部分:
+
+* Python代码的性能分析
+* Python与C++混合代码的性能分析
+
+
+## Python代码的性能分析
+
+### 生成性能分析文件
+
+Python标准库中提供了性能分析的工具包，[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下:
+
+```bash
+python -m cProfile -o profile.out main.py
+```
+
+其中`-o`标识了一个输出的文件名，用来存储本次性能分析的结果。如果不指定这个文件，`cProfile`会打印一些统计信息到`stdout`。这不方便我们进行后期处理(进行`sort`, `split`, `cut`等等)。
+
+### 查看性能分析文件
+
+当main.py运行完毕后，性能分析结果文件`profile.out`就生成出来了。我们可以使用[cprofilev](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务，将性能分析结果以网页的形式展示出来。
+
+使用`pip install cprofilev`安装`cprofilev`工具。安装完成后，使用如下命令开启HTTP服务
+
+```bash
+cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
+```
+
+其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。
+
+访问对应网址，即可显示性能分析的结果。性能分析结果格式如下:
+
+```text
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
+```
+
+每一列的含义是:
+
+| 列名 | 含义 |
+| --- | --- |
+| ncalls | 函数的调用次数 |
+| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
+| percall | tottime的每次调用平均时间 |
+| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
+| percall | cumtime的每次调用平均时间 |
+| filename:lineno(function) | 文件名, 行号，函数名 |
+
+
+### 寻找性能瓶颈
+
+通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
+
+将性能分析结果按照tottime排序，效果如下:
+
+```text
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
+   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
+     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
+        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
+
+```
+
+可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。
+
+```text
+Called By:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+
+Function                                                                                                 was called by...
+                                                                                                             ncalls  tottime  cumtime
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
+                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
+
+
+Called:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+```
+
+通常观察热点函数间的调用关系，和对应行的代码，就可以了解到问题代码在哪里。当我们做出性能修正后，再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。
+
+
+
+## Python与C++混合代码的性能分析
+
+### 生成性能分析文件
+
+C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析
+
+使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为
+
+```bash
+apt install libgoogle-perftools-dev
+pip install yep
+```
+
+安装完毕后，我们可以通过
+
+```bash
+python -m yep -v main.py
+```
+
+生成性能分析文件。生成的性能分析文件为`main.py.prof`。
+
+命令行中的`-v`指定在生成性能分析文件之后，在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同，编译时可能会去掉调试信息，运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果，可以采取下面几点措施:
+
+1. 编译时指定`-g`生成调试信息。使用cmake的话，可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。
+2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
+3. 运行性能分析的时候，先从单线程开始，再开启多线程，进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
+
+### 查看性能分析文件
+
+在运行完性能分析后，会生成性能分析结果文件。我们可以使用[pprof](https://github.com/google/pprof)来显示性能分析结果。注意，这里使用了用`Go`语言重构后的`pprof`，因为这个工具具有web服务界面，且展示效果更好。
+
+安装`pprof`的命令和一般的`Go`程序是一样的，其命令如下:
+
+```bash
+go get github.com/google/pprof
+```
+
+进而我们可以使用如下命令开启一个HTTP服务:
+
+```bash
+pprof -http=0.0.0.0:3213 `which python`  ./main.py.prof
+```
+
+这行命令中，`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径，进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。
+
+访问对应的网址，我们可以查看性能分析的结果。结果如下图所示:
+
+![result](./pprof_1.png)
+
+
+### 寻找性能瓶颈
+
+与寻找Python代码的性能瓶颈类似，寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。
+
+例如下图中，
+
+![kernel_perf](./pprof_2.png)
+
+在一次训练中，乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然，`MomentumOp`的性能有问题。
+
+在`pprof`中，对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题，再检查其他部分的性能问题，可以更有次序的完成性能的优化。
+
+## 总结
+
+至此，两种性能分析的方式都介绍完毕了。希望通过这两种性能分析的方式，Paddle的开发人员和使用人员可以有次序的，科学的发现和解决性能问题。
diff --git a/doc/howto/optimization/pprof_1.png b/doc/howto/optimization/pprof_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb
Binary files /dev/null and b/doc/howto/optimization/pprof_1.png differ
diff --git a/doc/howto/optimization/pprof_2.png b/doc/howto/optimization/pprof_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b
Binary files /dev/null and b/doc/howto/optimization/pprof_2.png differ
diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md
index 882066f23714f7ab3bba9199b5fa5ff2325ce849..424d7718c64438496cf0895397babd5408e1ca02 100644
--- a/doc/mobile/cross_compiling_for_android_cn.md
+++ b/doc/mobile/cross_compiling_for_android_cn.md
@@ -1,4 +1,4 @@
-# 构建Android平台上的PaddlePaddle库
+# Android平台编译指南
 
 用户可通过如下两种方式，交叉编译Android平台上适用的PaddlePaddle库：
 - 基于Docker容器的编译方式
diff --git a/doc/mobile/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md
index cda636a67de712e072f4cc7ad859dda75211eaa8..9da48e7f2119ce901fbb3abab73400df27be16d2 100644
--- a/doc/mobile/cross_compiling_for_ios_cn.md
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
@@ -1,4 +1,4 @@
-# 构建iOS平台上的PaddlePaddle库
+# iOS平台编译指南
 交叉编译iOS平台上适用的PaddlePaddle库，需要在MacOS系统上进行。本文的将介绍在MacOS上，从源码交叉编译iOS平台上适用的PaddlePaddle库。
 
 ## 准备交叉编译环境
@@ -25,7 +25,7 @@ iOS平台可选配置参数：
 - `IOS_PLATFORM`，可设置为`OS/SIMULATOR`，默认值为`OS`。
   - `OS`，构建目标为`arm`架构的iPhone或者iPad等物理设备。
   - `SIMULATOR`，构建目标为`x86`架构的模拟器平台。
-- `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示：
+- `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示，默认编译所有架构：
 
     <table class="docutils">
     <colgroup>
@@ -41,11 +41,11 @@ iOS平台可选配置参数：
     <tbody valign="top">
       <tr class="row-even">
       <td>OS</td>
-      <td>armv7, armv7s, arm64 (默认)</td>
+      <td>armv7, armv7s, arm64 </td>
     </tr>
     <tr class="row-odd">
       <td>SIMULATOR</td>
-      <td>i386, x86_64 (默认)</td>
+      <td>i386, x86_64 </td>
     </tr>
     </tbody>
     </table>
@@ -66,7 +66,7 @@ iOS平台可选配置参数：
 ```bash
 cmake -DCMAKE_SYSTEM_NAME=iOS \
       -DIOS_PLATFORM=OS \
-      -DIOS_ARCH="arm64" \
+      -DIOS_ARCH="armv7;arm64" \
       -DIOS_ENABLE_BITCODE=ON \
       -DIOS_USE_VECLIB_FOR_BLAS=ON \
       -DCMAKE_INSTALL_PREFIX=your/path/to/install \
@@ -112,6 +112,6 @@ $ make install
 - `lib`目录，其中包含PaddlePaddle的C-API静态库
 - `third_party`目录，其中包含所依赖的所有第三方库
 
-注意，不同架构的PaddlePaddle库建议安装到不同的目录下，然后使用`lipo`工具将多个静态库合并成一个支持多个架构的fat库。
+注意，如果PaddlePaddle库需要同时支持真机和模拟器，则需要分别编译真机和模拟器版本，然后使用`lipo`工具合并fat库。
 
 自此，PaddlePaddle库已经安装完成，用户可将合成的fat库用于深度学习相关的iOS App中，调用方法见C-API文档。
diff --git a/doc/mobile/cross_compiling_for_raspberry_cn.md b/doc/mobile/cross_compiling_for_raspberry_cn.md
index 6e983645faaed1f67edaeeb82ddbef9cef6bb85f..f8ef9dc8031613831437745995268f3abc392f5b 100644
--- a/doc/mobile/cross_compiling_for_raspberry_cn.md
+++ b/doc/mobile/cross_compiling_for_raspberry_cn.md
@@ -1,4 +1,4 @@
-# 构建Raspberry Pi平台上的PaddlePaddle库
+# Raspberry Pi平台编译指南
 
 通常有两个方法来构建基于 Rasspberry Pi 的版本：
 
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index d5b55e1c95f248f551e6a0a3b39123169dd7784f..30f3a766f0c65187c8f2dd4603e3d26c9b9a6a3d 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -55,7 +55,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
 }
 
 PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
-                                          paddle_real* value) {
+                                            paddle_real* value) {
   if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
   auto ptr = cast(mat);
   if (ptr->mat == nullptr) return kPD_NULLPTR;
@@ -75,7 +75,7 @@ PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
 }
 
 PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
-                                          paddle_real* result) {
+                                            paddle_real* result) {
   if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
   auto ptr = cast(mat);
   if (ptr->mat == nullptr) return kPD_NULLPTR;
diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/capi/examples/model_inference/dense/main.c
index 876af2aa7615c098d225b56ce2ea0b1529a6e3c6..5eeaf7e31fac7c9ed0b9269e74a7e467bde155ef 100644
--- a/paddle/capi/examples/model_inference/dense/main.c
+++ b/paddle/capi/examples/model_inference/dense/main.c
@@ -1,5 +1,6 @@
 #include <paddle/capi.h>
 #include <time.h>
+
 #include "../common/common.h"
 
 #define CONFIG_BIN "./trainer_config.bin"
@@ -27,20 +28,19 @@ int main() {
   CHECK(paddle_arguments_resize(in_args, 1));
 
   // Create input matrix.
-  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 10,
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
                                            /* size */ 784,
                                            /* useGPU */ false);
   srand(time(0));
 
-  std::vector<paddle_real> input;
-  input.resize(784 * 10);
+  paddle_real* array;
+
+  // Get First row.
+  CHECK(paddle_matrix_get_row(mat, 0, &array));
 
-  for (int i = 0; i < input.size(); ++i) {
-    input[i] = rand() / ((float)RAND_MAX);
+  for (int i = 0; i < 784; ++i) {
+    array[i] = rand() / ((float)RAND_MAX);
   }
-  
-  // Set value for the input matrix
-  CHECK(paddle_matrix_set_value(mat, input.data()));
 
   CHECK(paddle_arguments_set_value(in_args, 0, mat));
 
@@ -53,17 +53,18 @@ int main() {
 
   CHECK(paddle_arguments_get_value(out_args, 0, prob));
 
-  std::std::vector<paddle_real> result;
-  int height;
-  int width;
+  uint64_t height;
+  uint64_t width;
 
-  CHECK(paddle_matrix_get_shape(prob, &height, &width);
-  result.resize(height * width);
-  CHECK(paddle_matrix_get_value(prob, result.data()));
+  CHECK(paddle_matrix_get_shape(prob, &height, &width));
+  CHECK(paddle_matrix_get_row(prob, 0, &array));
 
-  printf("Prob: ");
+  printf("Prob: \n");
   for (int i = 0; i < height * width; ++i) {
-    printf("%.2f ", result[i]);
+    printf("%.4f ", array[i]);
+    if ((i + 1) % width == 0) {
+      printf("\n");
+    }
   }
   printf("\n");
 
diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h
index 01b8bad2ee9f528f8622346f43b9ff82225a7e73..8cc3e0034e058daefc63c69efe0b1f575c586897 100644
--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
@@ -79,7 +79,7 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
  * @note  value should contain enough element of data to init the mat
  */
 PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
-                                          paddle_real* value);
+                                            paddle_real* value);
 
 /**
  * @brief PDMatGetRow Get raw row buffer from matrix
@@ -93,14 +93,14 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
                                           paddle_real** rawRowBuffer);
 
 /**
- * @brief copy data from the matrix 
+ * @brief copy data from the matrix
  * @param [in] mat Target matrix
- * @param [out] result pointer to store the matrix data 
+ * @param [out] result pointer to store the matrix data
  * @return paddle_error
  * @note the space of the result should allocated before invoke this API
  */
 PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
-                                          paddle_real* result);
+                                            paddle_real* result);
 /**
  * @brief PDMatCreateNone Create None Matrix
  * @return
diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h
index ede2670882ee2b93f610a2261a4ecc1784bc2d0c..4ab8de80d1c7be0f8e3eb848955373dd5e21bc18 100644
--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -25,7 +25,9 @@ limitations under the License. */
 #include "hl_matrix.h"
 #include "hl_sequence.h"
 #include "hl_sparse.h"
+#ifndef PADDLE_MOBILE_INFERENCE
 #include "hl_warpctc_wrap.h"
+#endif
 
 #ifdef HPPL_STUB_FUNC
 #include "stub/hl_aggregate_stub.h"
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index c08e844847737b1172f6453767cc7f5e7b1a2bda..4b0eff3adb6fff0c9599b8613c5f19daea840674 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -6,7 +6,10 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 
 cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
+
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
+cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
+
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
@@ -51,10 +54,6 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope frame
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
-
-cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
-cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place)
-
 cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
         proto_desc)
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 00d9dd238ec5328be28f58f8118daad3a039e08c..8fd2906107c490eee129fc10262df28bfa67800b 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -22,7 +22,6 @@
 
 #include "paddle/framework/block_desc.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/operators/dynamic_recurrent_op.h"
 #include "paddle/operators/net_op.h"
 
 namespace paddle {
@@ -218,21 +217,6 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
                      return false;
                    });
 
-    // process recurrent gradient op as a special operator.
-    if (forwardOp.Type() == "dynamic_recurrent") {
-      // NOTE clean up cycle call somewhere (RNN's stepnet constains itself),
-      // or this will result in infinite loop.
-      const auto& rnnop =
-          *static_cast<const operators::DynamicRecurrentOp*>(&forwardOp);
-      auto rnn_grad_op =
-          static_cast<operators::DynamicRecurrentGradientOp*>(grad_op.get());
-      const auto& stepnet_op =
-          *static_cast<const OperatorBase*>(&rnnop.rnn.GetStepUnit());
-      // create stepnet's gradient op
-      rnn_grad_op->rnn.SetStepUnit(
-          BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
-    }
-
     if (net->ops_.empty()) {  // Current no aux op is added to network
       return grad_op;
     }
@@ -513,21 +497,16 @@ ParamGradInfoMap AppendBackward(
   const int root_block_idx = 0;
   auto root_block = program_desc.MutableBlock(root_block_idx);
 
-  // insert fill one op for target
-  // TODO(qiao) add some check to the target.
   std::string fill_one_op_out = GradVarName(target.Name());
-  std::vector<int64_t> target_shape_desc = target.Shape();
-  std::vector<int> target_shape;
-  std::transform(target_shape_desc.begin(), target_shape_desc.end(),
-                 std::back_inserter(target_shape),
-                 [](int64_t dim) { return static_cast<int>(dim); });
+  bool is_scalar = target.Shape() == std::vector<int64_t>{1};
+  PADDLE_ENFORCE(is_scalar, "target should be scalar");
   VLOG(3) << "backward from loss=" << target.Name()
           << " data_type=" << target.GetDataType();
   std::unique_ptr<OpDescBind> fill_one_op(
       new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
-                     {{"shape", target_shape},
+                     {{"shape", std::vector<int>{1}},
                       {"value", static_cast<float>(1.0)},
-                      {"data_type", target.GetDataType()}}));
+                      {"dtype", target.GetDataType()}}));
   // infer var type of fill_one_op
   fill_one_op->InferVarType(root_block);
 
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index d485cdf6109274377ad0057223bdd8401e964aa7..2b858f5ea0874d7bf1a9cf38529f5d0d70cca7f2 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -508,6 +508,7 @@ TEST(Backward, simple_single_op) {
   op->SetOutput("Out", {"out"});
 
   auto target = f::VarDescBind("out");
+  target.SetShape({1});
   auto var_to_grad = AppendBackward(program, target, {});
 
   ASSERT_EQ(block->AllOps().size(), 3UL);
@@ -544,6 +545,7 @@ TEST(Backward, default_attribute) {
   op->CheckAttrs();
 
   auto target = f::VarDescBind("out");
+  target.SetShape({1});
   AppendBackward(program, target, {});
 
   ASSERT_EQ(block->AllOps().size(), 3UL);
@@ -581,6 +583,7 @@ TEST(Backward, simple_mult_op) {
   op3->SetOutput("Out", {"out3"});
 
   auto target = f::VarDescBind("out3");
+  target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {});
 
@@ -670,6 +673,7 @@ TEST(Backward, intermedia_var_no_grad) {
   op4->SetOutput("Out", {"out4"});
 
   auto target = f::VarDescBind("out4");
+  target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {"out3"});
 
@@ -730,6 +734,7 @@ TEST(Backward, var_no_grad) {
   op2->SetOutput("Z", {"z2"});
 
   auto target = f::VarDescBind("z2");
+  target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {"z1"});
 
@@ -810,6 +815,7 @@ TEST(Backward, shared_var) {
   op3->SetOutput("Out", {"out3"});
 
   auto target = f::VarDescBind("out3");
+  target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {});
 
@@ -888,6 +894,7 @@ TEST(Backward, half_backward) {
   op1->SetOutput("Out", {"out"});
 
   auto target = f::VarDescBind("out");
+  target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {"b"});
   f::OpDescBind *fill_op = block->AllOps()[forward_len];
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
index be144d8fc0104fccc08006532a85906ade25c2a1..c54d2d4ddf09c445fb25c1fbe8a7498f233d8212 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -46,6 +46,8 @@ inline std::type_index ToTypeIndex(DataType type) {
       return typeid(int);
     case DataType::INT64:
       return typeid(int64_t);
+    case DataType::BOOL:
+      return typeid(bool);
     default:
       PADDLE_THROW("Not support type %d", type);
   }
@@ -66,6 +68,9 @@ inline void VisitDataType(DataType type, Visitor visitor) {
     case DataType::INT64:
       visitor.template operator()<int64_t>();
       break;
+    case DataType::BOOL:
+      visitor.template operator()<bool>();
+      break;
     default:
       PADDLE_THROW("Not supported");
   }
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index adedd8cb0e8504fd6fc924e62a2ede3c1c7ce698..2ffb5b7dbb27b561092856eac0de23d0c3788f75 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -120,7 +120,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
 
   for (auto& op_desc : block.AllOps()) {
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
-    VLOG(10) << op->DebugString();
+    VLOG(3) << op->DebugString();
     op->Run(*local_scope, *device);
   }
   if (create_local_scope) {
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index a0f2906c749054c1ff9f624e47df432ec2bd6ac8..fdf6de4babff3bb3c253aaf516636882237e6faf 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -13,6 +13,8 @@
    limitations under the License. */
 
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
 
 #include "paddle/memory/memcpy.h"
 #include "paddle/memory/memory.h"
@@ -27,11 +29,11 @@
 namespace paddle {
 namespace framework {
 
-std::ostream& operator<<(std::ostream& os, const LoD& lod) {
+std::ostream &operator<<(std::ostream &os, const LoD &lod) {
   os << "{";
-  for (auto& v : lod) {
+  for (auto &v : lod) {
     os << "{";
-    for (auto& i : v) {
+    for (auto &i : v) {
       os << i << ",";
     }
     os << "}";
@@ -41,7 +43,7 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) {
   return os;
 }
 
-LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
+LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) {
   LoD new_lod;
   new_lod.reserve(level_end - level_begin);
   for (size_t i = level_begin; i < level_end; i++) {
@@ -53,7 +55,7 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
   return new_lod;
 }
 
-LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
+LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
                  size_t elem_end) {
   PADDLE_ENFORCE_LT(level, in.size());
   PADDLE_ENFORCE_LT(elem_end, in[level].size());
@@ -64,9 +66,9 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
   res[0].assign(in[level].begin() + elem_begin,
                 in[level].begin() + elem_end + 1);
   for (size_t lvl = 1; lvl < res.size(); lvl++) {
-    const auto& in_level = in[level + lvl];
-    const auto& above_level = res[lvl - 1];
-    auto& out_level = res[lvl];
+    const auto &in_level = in[level + lvl];
+    const auto &above_level = res[lvl - 1];
+    auto &out_level = res[lvl];
     out_level.assign(in_level.begin() + above_level.front(),
                      in_level.begin() + above_level.back() + 1);
   }
@@ -74,33 +76,33 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
     // to make the first offset equals 0, all the elements minus the first
     // element
     size_t front = res[lvl].front();
-    for (auto& ele : res[lvl]) {
+    for (auto &ele : res[lvl]) {
       ele -= front;
     }
   }
   return res;
 }
 
-LoD ToAbsOffset(const LoD& in) {
+LoD ToAbsOffset(const LoD &in) {
   // the lowest level stores relative offsets
   if (in.empty() || in.size() == 1) return in;
   LoD result = in;
   for (int level = result.size() - 2; level >= 0; level--) {
-    for (auto& ele : result[level]) {
+    for (auto &ele : result[level]) {
       ele = result[level + 1][ele];
     }
   }
   return result;
 }
 
-bool operator==(const LoD& a, const LoD& b) {
+bool operator==(const LoD &a, const LoD &b) {
   if (a.size() != b.size()) {
     return false;
   }
 
   for (size_t i = 0; i < a.size(); i++) {
-    const auto& a_level = a[i];
-    const auto& b_level = b[i];
+    const auto &a_level = a[i];
+    const auto &b_level = b[i];
     if (a_level.size() != b_level.size()) {
       return false;
     }
@@ -151,7 +153,7 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
 }
 
 using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
-LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
+LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
                                         size_t end_idx, size_t start_level) {
   LoD sub_lod;
 
@@ -170,7 +172,7 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
   return LoDAndOffset{sub_lod, {start_idx, end_idx}};
 }
 
-void AppendLoD(LoD* lod, const LoD& lod_length) {
+void AppendLoD(LoD *lod, const LoD &lod_length) {
   PADDLE_ENFORCE(
       lod->empty() || lod->size() == lod_length.size(),
       "The lod_length should has the same size with the appended lod.");
@@ -178,12 +180,139 @@ void AppendLoD(LoD* lod, const LoD& lod_length) {
     *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
   }
   for (size_t i = 0; i < lod->size(); ++i) {
-    auto& level = (*lod)[i];
+    auto &level = (*lod)[i];
     for (size_t len : lod_length[i]) {
       level.push_back(level.back() + len);
     }
   }
 }
 
+void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
+                       const platform::DeviceContext &dev_ctx) {
+  // TODO(typhoonzero): serialize to ostream
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char *>(&version), sizeof(version));
+  }
+  {  // the 2nd field, tensor description
+     // int32_t  size
+     // void*    protobuf message
+    framework::TensorDesc desc;
+    desc.set_data_type(framework::ToDataType(tensor.type()));
+    auto dims = framework::vectorize(tensor.dims());
+    auto *pb_dims = desc.mutable_dims();
+    pb_dims->Resize(static_cast<int>(dims.size()), 0);
+    std::copy(dims.begin(), dims.end(), pb_dims->begin());
+    int32_t size = desc.ByteSize();
+    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+    auto out = desc.SerializeAsString();
+    os.write(out.data(), size);
+  }
+  {  // the 3rd field, tensor data
+    uint64_t size = tensor.memory_size();
+    auto *data_ptr = tensor.data<void>();
+    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
+                   "Index overflow when writing tensor");
+    if (platform::is_gpu_place(tensor.place())) {
+#ifdef PADDLE_WITH_CUDA
+      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
+      std::unique_ptr<char[]> buf(new char[kBufSize]);
+      auto &gpu_dev_ctx =
+          static_cast<const platform::CUDADeviceContext &>(dev_ctx);
+      platform::CPUPlace cpu;
+      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+      while (size != 0) {
+        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+        memory::Copy(cpu, buf.get(),
+                     boost::get<platform::GPUPlace>(tensor.place()),
+                     reinterpret_cast<const void *>(data), size_to_write,
+                     gpu_dev_ctx.stream());
+        gpu_dev_ctx.Wait();
+        os.write(buf.get(), size_to_write);
+        data += size_to_write;
+        size -= size_to_write;
+      }
+#else
+      PADDLE_THROW("Unexpected branch");
+#endif
+    } else {
+      os.write(static_cast<const char *>(data_ptr),
+               static_cast<std::streamsize>(size));
+    }
+  }
+  {  // the 4th field, lod information
+     // uint64_t lod_level
+     // uint64_t lod_level_1 size in byte.
+     // int*     lod_level_1 data
+     // ...
+    auto lod = tensor.lod();
+    uint64_t size = lod.size();
+    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+
+    for (auto &each : lod) {
+      size = each.size() * sizeof(framework::LoD::value_type::value_type);
+      os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+      os.write(reinterpret_cast<const char *>(each.data()),
+               static_cast<std::streamsize>(size));
+    }
+  }
+}
+
+void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
+  uint32_t version;
+  is.read(reinterpret_cast<char *>(&version), sizeof(version));
+  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  framework::TensorDesc desc;
+  {  // int32_t size
+     // proto buffer
+    int32_t size;
+    is.read(reinterpret_cast<char *>(&size), sizeof(size));
+    std::unique_ptr<char[]> buf(new char[size]);
+    is.read(reinterpret_cast<char *>(buf.get()), size);
+    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
+                   "Cannot parse tensor desc");
+  }
+  {  // read tensor
+    std::vector<int64_t> dims;
+    dims.reserve(static_cast<size_t>(desc.dims().size()));
+    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
+    tensor->Resize(framework::make_ddim(dims));
+
+    void *buf;
+    platform::Place cpu = platform::CPUPlace();
+    switch (desc.data_type()) {
+      case framework::FP32:
+        buf = tensor->mutable_data<float>(cpu);
+        break;
+      case framework::FP64:
+        buf = tensor->mutable_data<double>(cpu);
+        break;
+      case framework::INT32:
+        buf = tensor->mutable_data<int>(cpu);
+        break;
+      case framework::INT64:
+        buf = tensor->mutable_data<int64_t>(cpu);
+        break;
+      default:
+        PADDLE_THROW("DataType %d not supported", desc.data_type());
+    }
+    is.read(static_cast<char *>(buf), tensor->memory_size());
+  }
+  {  // read lod
+    uint64_t lod_level;
+    is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
+    auto &lod = *tensor->mutable_lod();
+    lod.resize(lod_level);
+    for (uint64_t i = 0; i < lod_level; ++i) {
+      uint64_t size;
+      is.read(reinterpret_cast<char *>(&size), sizeof(size));
+      std::vector<size_t> tmp(size / sizeof(size_t));
+      is.read(reinterpret_cast<char *>(tmp.data()),
+              static_cast<std::streamsize>(size));
+      lod[i] = tmp;
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 7f8a51cc581e759bc707e506ac7cdeb3680f40ac..9411c96aea4c10ebf921cc3e3b442769c8acbefa 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -24,6 +24,7 @@
 #include <glog/logging.h>
 #include "paddle/framework/ddim.h"
 #include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 
@@ -175,9 +176,9 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
   PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1);
   for (size_t ins = 0; ins < num_instances; ins++) {
     for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) {
-      tensor.Slice(elem, elem + 1)
-          .CopyFrom(source.Slice(ins, ins + 1), platform::CPUPlace(),
-                    platform::CPUDeviceContext());
+      auto slice = tensor.Slice(elem, elem + 1);
+      CopyFrom(source.Slice(ins, ins + 1), platform::CPUPlace(),
+               platform::CPUDeviceContext(), &slice);
     }
   }
   return tensor;
@@ -188,5 +189,14 @@ std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
 
 void AppendLoD(LoD* lod, const LoD& lod_length);
 
+/*
+ * Serialize/Desiralize LoDTensor to std::ostream
+ * You can pass ofstream or ostringstream to serilize to file
+ * or to a in memory string. GPU tensor will be copied to CPU.
+ */
+void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
+                       const platform::DeviceContext& dev_ctx);
+void DeserializeFromStream(std::istream& is, LoDTensor* tensor);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/prune.cc b/paddle/framework/prune.cc
index bf3066983cdcf44ae84f236ac72486e5d4fd5b92..da76052eb4d3067214841af72a35cebb26477e7f 100644
--- a/paddle/framework/prune.cc
+++ b/paddle/framework/prune.cc
@@ -26,6 +26,8 @@ namespace framework {
 
 const std::string kFeedOpType = "feed";
 const std::string kFetchOpType = "fetch";
+const std::string kDropOutOpType = "dropout";
+const std::string kBatchNormOpType = "batch_norm";
 
 bool HasDependentVar(const OpDesc& op_desc,
                      const std::set<std::string>& dependent_vars) {
@@ -106,5 +108,26 @@ void Prune(const ProgramDesc& input, ProgramDesc* output) {
   prune_impl(input, output, 0);
 }
 
+void inference_optimize_impl(const ProgramDesc& input, ProgramDesc* output,
+                             int block_id) {
+  *output = input;
+  auto* op_field = output->mutable_blocks(block_id)->mutable_ops();
+  for (auto& op_desc : *op_field) {
+    if (op_desc.type() == kDropOutOpType ||
+        op_desc.type() == kBatchNormOpType) {
+      for (auto& attr : *op_desc.mutable_attrs()) {
+        if (attr.name() == "is_test") {
+          attr.set_b(true);
+          break;
+        }
+      }
+    }
+  }
+}
+
+void InferenceOptimize(const ProgramDesc& input, ProgramDesc* output) {
+  inference_optimize_impl(input, output, 0);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/prune.h b/paddle/framework/prune.h
index 8cfb16343aa44dcc8a3349b01adecce33f1c2b5b..23db014894348094a98e043aa744c6f0d27b2640 100644
--- a/paddle/framework/prune.h
+++ b/paddle/framework/prune.h
@@ -22,5 +22,7 @@ namespace framework {
 
 void Prune(const ProgramDesc& input, ProgramDesc* output);
 
+void InferenceOptimize(const ProgramDesc& input, ProgramDesc* output);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 28d0fcf94ec31c82476e093f93ccee222a0c9d9a..6a0c5133c9a6bb326ca51755242e75b6eb9e5474 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -89,34 +89,6 @@ class Tensor {
   /*! The internal of two tensors share the same memory block. */
   inline Tensor& ShareDataWith(const Tensor& src);
 
-  /**
-   * @brief   Copy the content of external tensor to a new place.
-   *
-   * @param[in] src        The external tensor.
-   * @param[in] dst_place  The dst place.
-   * @param[in] ctx        The device context contains device resources.
-   *
-   * @note    CopyFrom supports CPU <-> GPU, GPU <-> GPU.
-   */
-  // TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647
-  // Remove `CopyFrom` and `CopyFromVector` from Tensor interface
-  // and make them global functions
-  inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
-                       const platform::DeviceContext& ctx);
-
-  /**
-   * @brief   Copy the content of an external vector to a tensor.
-   *
-   * @param[in] src        The external tensor.
-   * @param[in] ctx        The device context contains device resources.
-   *
-   * * @note    CopyFromVector assumes that the tensor has been resized
-   *            before invoking.
-   */
-  template <typename T>
-  inline void CopyFromVector(const std::vector<T>& src,
-                             const platform::DeviceContext& ctx);
-
   /**
    * @brief  Return a sub-tensor of the given tensor.
    *
@@ -141,7 +113,6 @@ class Tensor {
 
   size_t memory_size() const;
 
- private:
   inline void check_memory_size() const;
 
  private:
diff --git a/paddle/framework/tensor_array.cc b/paddle/framework/tensor_array.cc
deleted file mode 100644
index 0947e33548130a923e998f8bad68db00097af909..0000000000000000000000000000000000000000
--- a/paddle/framework/tensor_array.cc
+++ /dev/null
@@ -1,444 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-
-
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/framework/tensor_array.h"
-
-#include <glog/logging.h>
-#include <algorithm>
-#include <limits>
-
-#include "paddle/framework/eigen.h"
-
-namespace paddle {
-namespace framework {
-
-namespace detail {
-
-/*
- * Offer an iterator over the length-sorted lod-tensor's top level. The top
- * level of a lod-tensor stores batch-size of sequences, each top-level sequence
- * may contains several lower-level sequences, sort top-level lod by the numbers
- * of lower-level sequences in descending order, so that during RNN's running,
- * the batch-size will keep decreasing, the short sentences will end at the tail
- * of each batch.
- *
- * Let's take a simple lod-tensor for example
- *
- *   |(0)       |(1)        top-level has two instances
- *   |||        |||||    lower-level
- *
- * sort by lower-level's length
- *
- *   |(1)       |(0)
- *   |||||      |||
- *
- * when RNN runs, it get 5 batches (equals the number of elements the longest
- * sequence has)
- *
- * |||||
- * |||
- *
- * the first three batches has two elements, the last two elements just has 1
- * element each.
- */
-struct DynamicBatchUnpacker {
-  using value_type = float;
-
-  DynamicBatchUnpacker(const LoDTensor& source, size_t level,
-                       bool descend = true)
-      : source(&source), level(level) {
-    BuildLengthSortedMeta(descend);
-  }
-
-  LoDTensor GetBatch(size_t index);
-
-  std::vector<DySeqMeta> meta;
-
-  LoDTensor const* source;
-  size_t level;
-
- protected:
-  void BuildLengthSortedMeta(bool descend);
-};
-
-LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
-                           const std::vector<DySeqMeta>& meta, const LoD& lod,
-                           size_t level);
-
-std::vector<size_t> GenDyBatchIndice(const DySeqMetaBatch& meta, int batch_id) {
-  // collect indice need to copy to the batch
-  std::vector<size_t> indice;
-  for (const auto& seq : meta) {
-    size_t id = seq.begin + batch_id;
-    if (id >= seq.end) break;
-    indice.push_back(id);
-  }
-  return indice;
-}
-
-}  // namespace detail
-
-const LoDTensor& TensorArray::Read(size_t index) const {
-  PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index);
-  if (index >= size()) {
-    values_.resize(index + 1);
-  }
-  return values_[index];
-}
-
-void TensorArray::Write(size_t index, const LoDTensor& value) {
-  PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index);
-
-  if (index >= size()) {
-    values_.resize(index + 1);
-  }
-
-  values_[index].set_lod(value.lod());
-  values_[index].Resize(value.dims());
-  values_[index].mutable_data<value_type>(value.place());
-  values_[index].CopyFrom(value, value.place(), platform::CPUDeviceContext());
-}
-
-void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
-  PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index);
-  if (index >= size()) {
-    values_.resize(index + 1);
-  }
-
-  values_[index].set_lod(value.lod());
-  values_[index].ShareDataWith(value);
-}
-
-LoDTensor TensorArray::Pack(size_t level, const std::vector<DySeqMeta>& meta,
-                            const LoD& lod) const {
-  return detail::PackDynamicBatch(values_, meta, lod, level);
-}
-
-DySeqMetaBatch TensorArray::Unpack(const LoDTensor& source, int level,
-                                   bool length_desend) {
-  detail::DynamicBatchUnpacker unpacker(source, level,
-                                        length_desend /*descend*/);
-
-  // find max length of all the sequences
-  size_t max_length = 0;
-  for (const auto& seq : unpacker.meta) {
-    max_length = std::max(max_length, seq.end - seq.begin);
-  }
-
-  // write batches to values
-  for (size_t batch_id = 0; batch_id < max_length; batch_id++) {
-    Write(batch_id, unpacker.GetBatch(batch_id));
-  }
-
-  PADDLE_ENFORCE(!unpacker.meta.empty());
-  return unpacker.meta;
-}
-
-LoDTensor TensorArray::LodPack(size_t level) const {
-  PADDLE_ENFORCE_GT(size(), 0UL, "no time step exists");
-  // the levels should be no less than 2
-  LoDTensor merged;
-  const LoDTensor *pre, *cur;
-  pre = &Read(0);
-
-  for (size_t step = 1; step < size(); step++) {
-    cur = &Read(step);
-    PADDLE_ENFORCE_GT(cur->NumLevels(), 0);
-    PADDLE_ENFORCE_GT(pre->NumLevels(), 0);
-    PADDLE_ENFORCE_EQ(pre->NumLevels(), cur->NumLevels());
-    PADDLE_ENFORCE_EQ(pre->NumElements(level), cur->NumElements(level));
-
-    merged = LodPackTwo(*pre, *cur, level);
-    pre = &merged;
-  }
-  return merged;
-}
-
-/*
- * NOTE currently, only the lowest level supports packing.
- * The lowest LoD will be changed, while the relative offsets in levels above
- * stay unchanged.
- *
- * previous step : [0] [1] [3]
- * current step: [0 1 2] [2 3] []
- * packed to
- *   [0 0] [0 1] [0 2] [1 2] [1 3] [3]
- */
-LoDTensor TensorArray::LodPackTwo(const LoDTensor& pre, const LoDTensor& cur,
-                                  size_t level) const {
-  PADDLE_ENFORCE_EQ(pre.NumLevels(), cur.NumLevels());
-  PADDLE_ENFORCE_EQ(pre.NumLevels(), level + 1,
-                    "Only the lowest LoD level supports pack temporarily.");
-  // calculate the result tensor's shape first
-  size_t num_instances = 0;
-  for (size_t elem = 0; elem < pre.NumElements(level); elem++) {
-    size_t prefix_size = pre.NumElements(level, elem);
-    size_t num_candidates = cur.NumElements(level, elem);
-    if (num_candidates > 0) {
-      num_instances += num_candidates * (prefix_size + 1);
-    } else {
-      num_instances += prefix_size;
-    }
-  }
-
-  auto res_dims = pre.dims();
-  res_dims[0] = num_instances;
-  LoDTensor result;
-  result.Resize(res_dims);
-  result.mutable_data<value_type>(cur.place());
-
-  Vector<size_t> last_lod_level;
-  // copy data
-  size_t index = 0;
-  last_lod_level.push_back(index);
-  for (size_t elem = 0; elem < pre.NumElements(level); elem++) {
-    size_t prefix_size = pre.NumElements(level, elem);
-    size_t num_candidates = cur.NumElements(level, elem);
-
-    // slice the prefix Tensor
-    LoDTensor prefix = pre;
-    prefix.ShrinkInLevel(level, elem, elem + 1);
-    LoDTensor candidate = cur;
-    if (num_candidates > 0) {
-      candidate.ShrinkInLevel(level, elem, elem + 1);
-    } else {  // just push prefix
-      result.Slice(index, index + prefix_size)
-          .CopyFrom(prefix, result.place(), platform::CPUDeviceContext());
-      index += prefix_size;
-      last_lod_level.push_back(index);
-    }
-    for (size_t candi = 0; candi < num_candidates; candi++) {
-      // TODO(superjom) support GPU
-      result.Slice(index, index + prefix_size)
-          .CopyFrom(prefix, result.place(), platform::CPUDeviceContext());
-      index += prefix_size;
-      // copy candidate record
-      result.Slice(index, index + 1)
-          .CopyFrom(candidate.Slice(candi, candi + 1), result.place(),
-                    platform::CPUDeviceContext());
-      index++;
-      last_lod_level.push_back(index);
-    }
-  }
-
-  // update lod
-  auto lod = cur.lod();
-  lod.back() = last_lod_level;
-  result.set_lod(lod);
-  return result;
-}
-
-/*
- * source [0 1 2] [3 4] [5 6 7] will be transformd to a list of LoDTensors such
- * as
- * [0 3 5] [1 4 6] [2 7] with 1-level LoDs:
- * - [0 1 2 3]
- * - [0 1 2 3]
- * - [0 1 1 2], the [1,1) here means the second sequence is empty
- *
- * NOTE Unpack a LoDTensor in this approach may result in a big LoD.
- */
-void TensorArray::LodUnpack(const LoDTensor& source, size_t level) {
-  PADDLE_ENFORCE_EQ(level, source.NumLevels() - 1,
-                    "only the lowest LoD level supports unpack.");
-  const size_t non_empty_instances = source.dims()[0];
-  size_t index = 0;
-  Vector<size_t> lowest_lod_level;
-  lowest_lod_level.push_back(index);
-
-  for (size_t step = 0; step < non_empty_instances; step++) {
-    size_t num_instances = 0;
-    for (size_t id = 0; id < source.NumElements(level); id++) {
-      auto instance = source;
-      instance.ShrinkInLevel(level, id, id + 1);
-      if (static_cast<size_t>(instance.dims()[0]) > step) {
-        num_instances++;
-        index++;
-      }
-      lowest_lod_level.push_back(index);
-    }
-
-    // create tensor for this time step
-    LoDTensor tensor;
-    auto dims = source.dims();
-    dims[0] = num_instances;
-    // set lod
-    auto lod = source.lod();
-    lod.back() = lowest_lod_level;
-    tensor.set_lod(lod);
-
-    index = 0;
-    for (size_t id = 0; id < source.NumElements(level); id++) {
-      auto instance = source;
-      instance.ShrinkInLevel(level, id, id + 1);
-      if (static_cast<size_t>(instance.dims()[0]) > step) {
-        // copy this instance
-        tensor.Slice(index, index + 1)
-            .CopyFrom(instance.Slice(step, step + 1), tensor.place(),
-                      platform::CPUDeviceContext());
-        index++;
-      }
-    }
-    Write(step, tensor);
-  }
-}
-
-LoDTensor TensorArray::Stack() const {
-  LoDTensor result;
-  if (size() == 0) return result;
-
-  const auto& first_dims = values_.front().dims();
-  // check all the values have the same shape
-  // TODO(superjom) check the same dtypes
-  for (size_t idx = 1; idx < size(); idx++) {
-    const auto& value_dims = values_[idx].dims();
-    PADDLE_ENFORCE_EQ(first_dims, value_dims);
-  }
-
-  // copy
-  auto result_dims = vectorize(first_dims);
-  result_dims.insert(result_dims.begin(), size());
-  result.Resize(make_ddim(result_dims));
-  result.mutable_data<value_type>(platform::CPUPlace());
-
-  for (size_t idx = 0; idx < size(); idx++) {
-    result.Slice(idx, idx + 1)
-        .CopyFrom(Read(idx), platform::CPUPlace(),
-                  platform::CPUDeviceContext());
-  }
-  return result;
-}
-
-void TensorArray::Unstack(const LoDTensor& source) const {
-  Unstack(source, false /*data_shared*/);
-}
-
-void TensorArray::UnstackShared(const LoDTensor& source) const {
-  Unstack(source, true /*data_shared*/);
-}
-
-void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const {
-  size_t first_dim = source.dims()[0];
-  DDim value_dims = slice_ddim(source.dims(), 1, source.dims().size());
-  PADDLE_ENFORCE_GT(first_dim, 0,
-                    "source should have some data to be unstacked");
-
-  values_.resize(first_dim);
-
-  for (size_t elem = 0; elem < first_dim; elem++) {
-    // create a new value
-    auto& value = values_[elem];
-    if (data_shared) {
-      // share memory
-      value.ShareDataWith(source.Slice(elem, elem + 1));
-    } else {
-      // copy
-      value.Resize(value_dims);
-      value.CopyFrom(source.Slice(elem, elem + 1), platform::CPUPlace(),
-                     platform::CPUDeviceContext());
-    }
-  }
-}
-
-size_t TensorArray::size() const { return values_.size(); }
-
-namespace detail {
-
-void DynamicBatchUnpacker::BuildLengthSortedMeta(bool descend) {
-  PADDLE_ENFORCE(meta.empty(), "duplicate build meta");
-  // collect meta for each sequence in some level
-  auto lod = SliceLevels(source->lod(), level, level + 1)[0];
-
-  for (size_t seq_id = 0; seq_id < lod.size() - 1; seq_id++) {
-    DySeqMeta seq_meta({lod[seq_id], lod[seq_id + 1], seq_id});
-    meta.push_back(seq_meta);
-  }
-
-  PADDLE_ENFORCE_GT(meta.size(), 0, "meta is empty");
-
-  // sort by length
-  sort(meta.begin(), meta.end(),
-       [descend](const DySeqMeta& a, const DySeqMeta& b) {
-         bool a_ge_b = (a.end - a.begin) > (b.end - b.begin);
-         return descend ? a_ge_b : !a_ge_b;
-       });
-}
-
-LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
-  PADDLE_ENFORCE(!meta.empty(), "should build meta first");
-  LoDTensor result;
-
-  auto indice = detail::GenDyBatchIndice(meta, index);
-  PADDLE_ENFORCE(!indice.empty(), "invalid batch at %d", index);
-
-  // copy the indice of records in LoDTensor
-  auto record_dims = slice_ddim(source->dims(), 1, source->dims().size());
-  auto record_dims_vec = vectorize(record_dims);
-  record_dims_vec.insert(record_dims_vec.begin(), indice.size());
-  result.Resize(make_ddim(record_dims_vec));
-  result.mutable_data<value_type>(platform::CPUPlace());
-
-  for (size_t i = 0; i < indice.size(); i++) {
-    auto index = indice[i];
-    auto target = result.Slice(i, i + 1);
-    auto slice = source->Slice(index, index + 1);
-
-    target.CopyFrom(slice, platform::CPUPlace(), platform::CPUDeviceContext());
-  }
-
-  return result;
-}
-
-// TODO(supejom) to cache lod if reasonable
-LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
-                           const std::vector<DySeqMeta>& meta, const LoD& lod,
-                           size_t level) {
-  PADDLE_ENFORCE(!source.empty());
-  PADDLE_ENFORCE(!meta.empty());
-  PADDLE_ENFORCE(!lod.empty());
-
-  LoDTensor result;
-
-  // init result space
-  auto record_dims = slice_ddim(source[0].dims(), 1, source[0].dims().size());
-  auto record_dims_vec = vectorize(record_dims);
-  auto height = lod[level].back();
-  record_dims_vec.insert(record_dims_vec.begin(), height);
-  result.Resize(make_ddim(record_dims_vec));
-  result.mutable_data<float>(platform::CPUPlace());
-
-  for (size_t batch_id = 0; batch_id < source.size(); batch_id++) {
-    for (size_t seq_id = 0; seq_id < meta.size(); seq_id++) {
-      const auto& seq_meta = meta[seq_id];
-      // source is source[batch_id][seq_id]
-      // target is result[index]
-      auto index = seq_meta.begin + batch_id;
-      if (index >= seq_meta.end) break;
-      auto source_ = source[batch_id].Slice(seq_id, seq_id + 1);
-      auto target = result.Slice(index, index + 1);
-      target.CopyFrom(source_, platform::CPUPlace(),
-                      platform::CPUDeviceContext());
-    }
-  }
-
-  result.set_lod(lod);
-  return result;
-}
-
-}  // namespace detail
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/framework/tensor_array.h b/paddle/framework/tensor_array.h
deleted file mode 100644
index 78fad8cab7e27a7f07ca542c2a083460ee9e2b79..0000000000000000000000000000000000000000
--- a/paddle/framework/tensor_array.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <vector>
-
-#include "paddle/framework/lod_tensor.h"
-
-namespace paddle {
-namespace framework {
-
-/*
- * DyBatchSeqPosition stores indices of the basic element in tensor. It is used
- * after lod-tensor's re-assembling, its info can be used to recover the order
- * in original lod-tensor.
- */
-struct DySeqMeta {
-  DySeqMeta(size_t begin, size_t end, size_t ori_idx)
-      : begin(begin), end(end), ori_idx(ori_idx) {}
-
-  size_t begin;
-  size_t end;  // not included
-  size_t ori_idx;
-};
-
-using DySeqMetaBatch = std::vector<DySeqMeta>;
-
-/*
- * Extract the indices of instances.
- */
-std::vector<size_t> GenDyBatchIndice(const DySeqMetaBatch &metas, int batch_id);
-
-/*
- * TensorArray is a C-array-like array of tensors, it is meant to be used with
- * dynamic iteration primitives such as while_loop. It is used to segment inputs
- * and store states in all time steps.
- *
- * By providing some methods similar to a C++ array, the difinition of some
- * state-based dynamic models such as RNN cound be more natural and highly
- * flexible.
- */
-class TensorArray {
- public:
-  using value_type = float;
-
-  // max number of values allowed to store.
-  const size_t MAX_SIZE{100000};
-
-  /*
-   * Read the value at location `index` in the `TensorArray`.
-   */
-  const LoDTensor &Read(size_t index) const;
-
-  /*
-   * Write value into the index of the TensorArray.
-   */
-  void Write(size_t index, const LoDTensor &value);
-
-  /*
-   * Write value into the index of the TensorArray, with memory shared.
-   */
-  void WriteShared(size_t index, const LoDTensor &value);
-
-  /*
-   * Recover the original LoD-arranged LoDTensor with the `values`, `level` and
-   * `indice_map`.
-   */
-  LoDTensor Pack(size_t level, const DySeqMetaBatch &meta,
-                 const LoD &lod) const;
-
-  /*
-   * Split LoDTensor in some `level` and write the generated batches to
-   * `values`, if set `desend`, will sort by length in descending order else in
-   * ascending order.
-   */
-  DySeqMetaBatch Unpack(const LoDTensor &source, int level, bool length_desend);
-
-  /*
-   * Pack an array of LoDTensors to a LoDTensor.
-   */
-  LoDTensor LodPack(size_t level) const;
-
-  /*
-   * Unpack a LoDTensor to an array of LoDTensors.
-   */
-  void LodUnpack(const LoDTensor &source, size_t level);
-
-  /*
-   * Pack the values into a tensor with rank one higher than each tensor in
-   * values.
-   */
-  LoDTensor Stack() const;
-
-  /*
-   * Unstacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors.
-   */
-  void Unstack(const LoDTensor &source) const;
-
-  /*
-   * Unstacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors,
-   * with memory of tensors shared.
-   */
-  void UnstackShared(const LoDTensor &source) const;
-
-  /*
-   * Return the number of values.
-   */
-  size_t size() const;
-
- protected:
-  void Unstack(const LoDTensor &source, bool data_shared) const;
-
-  LoDTensor LodPackTwo(const LoDTensor &pre, const LoDTensor &cur,
-                       size_t level) const;
-
- private:
-  mutable std::vector<LoDTensor> values_;
-};  // class TensorArray
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/framework/tensor_array_test.cc b/paddle/framework/tensor_array_test.cc
deleted file mode 100644
index 83b52b442daf9b2f1fc40f23e458fcb67c5040e8..0000000000000000000000000000000000000000
--- a/paddle/framework/tensor_array_test.cc
+++ /dev/null
@@ -1,182 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/framework/tensor_array.h"
-
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace framework {
-
-class TensorArrayTester : public ::testing::Test {
- protected:
-  void SetUp() override {
-    LoDTensor source;
-    source.Resize(make_ddim({batch_size, dim}));
-    int* data = source.mutable_data<int>(platform::CPUPlace());
-    for (int i = 0; i < 16 * 32; i++) {
-      data[i] = i;
-    }
-    ta.Unstack(source);
-  }
-
-  TensorArray ta;
-  const int batch_size = 16;
-  const int dim = 32;
-};
-
-TEST_F(TensorArrayTester, Read) {
-  for (int i = 0; i < batch_size; i++) {
-    const auto& tensor = ta.Read(i);
-    ASSERT_EQ(tensor.dims()[0], 1);
-    ASSERT_EQ(tensor.dims()[1], dim);
-  }
-}
-
-TEST_F(TensorArrayTester, Write) {
-  LoDTensor source;
-  source.Resize(make_ddim({1, dim}));
-  for (int i = 0; i < dim; i++) {
-    *(source.mutable_data<int>(platform::CPUPlace()) + i) = i;
-  }
-
-  ta.Write(2, source);
-
-  const auto& tensor = ta.Read(2);
-  for (int i = 0; i < dim; i++) {
-    EXPECT_EQ(*(tensor.data<int>() + i), *(source.data<int>() + i));
-  }
-}
-
-TEST_F(TensorArrayTester, WriteShared) {
-  LoDTensor source;
-  source.Resize(make_ddim({1, dim}));
-  for (int i = 0; i < dim; i++) {
-    *(source.mutable_data<int>(platform::CPUPlace()) + i) = i;
-  }
-
-  ta.WriteShared(2, source);
-
-  const auto& tensor = ta.Read(2);
-  for (int i = 0; i < dim; i++) {
-    EXPECT_EQ(*(tensor.data<int>() + i), *(source.data<int>() + i));
-  }
-
-  EXPECT_EQ(source.data<int>(), tensor.data<int>());
-}
-
-class TensorArrayPackTester : public ::testing::Test {
- protected:
-  virtual void SetUp() override {
-    lod.push_back(std::vector<size_t>{0, 2, 9, 13});
-
-    source.set_lod(lod);
-    source.Resize(make_ddim({13, 128}));
-    source.mutable_data<int>(platform::CPUPlace());
-
-    // content of each setence: 0 1 2 3 4
-    const auto& level = lod.front();
-    for (size_t i = 0; i < level.size() - 1; i++) {
-      size_t begin = level[i];
-      size_t end = level[i + 1];
-      for (size_t j = begin; j < end; j++) {
-        auto record = source.Slice(j, j + 1);
-        for (int dim = 0; dim < 128; dim++) {
-          record.mutable_data<int>(platform::CPUPlace())[dim] = j - begin;
-        }
-      }
-    }
-
-    // unpack
-    meta = ta.Unpack(source, 0, true);
-  }
-
-  LoD lod;
-  TensorArray ta;
-  LoDTensor source;
-  std::vector<DySeqMeta> meta;
-};
-
-TEST_F(TensorArrayPackTester, Unpack) {
-  ASSERT_EQ(ta.size(), 7UL);
-
-  const auto& t0 = ta.Read(0);
-  const auto& t1 = ta.Read(1);
-
-  ASSERT_EQ(t0.data<int>()[0], int(0));
-  ASSERT_EQ(t1.data<int>()[0], int(1));
-}
-
-TEST_F(TensorArrayPackTester, Pack) {
-  LoDTensor packed = ta.Pack(0, meta, lod);
-}
-
-TEST_F(TensorArrayTester, size) {
-  ASSERT_EQ(ta.size(), static_cast<size_t>(batch_size));
-}
-
-TEST(TensorArray, LodPack) {
-  // three time steps, each step stores a LoDTensors
-  // - [0] [1]
-  // - [2 3], [4 5]
-  // - [6 7] [] [8], [9, 10]
-  // try to get a LoDTensor with content:
-  // - [0 2 6]
-  // - [0 2 7]
-  // - [0 3]
-  // - [1 4 8]
-  // - [1 5 9]
-  // - [1 5 10]
-  std::array<LoDTensor, 3> tensors;
-  tensors[0].Resize(make_ddim({2, 1}));
-  tensors[1].Resize(make_ddim({4, 1}));
-  tensors[2].Resize(make_ddim({5, 1}));
-  int index = 0;
-  for (auto& t : tensors) {
-    t.mutable_data<int>(platform::CPUPlace());
-    for (int i = 0; i < t.dims()[0]; i++) {
-      t.data<int>()[i] = index;
-      index++;
-    }
-  }
-
-  std::array<LoD, 3> lods;
-  std::vector<std::vector<size_t>> levels{
-      {0, 1, 2}, {0, 2, 4}, {0, 2, 2, 3, 5}};
-  for (int i = 0; i < 3; i++) {
-    lods[i].emplace_back(levels[i].begin(), levels[i].end());
-  }
-
-  TensorArray ta;
-  for (int i = 0; i < 3; i++) {
-    tensors[i].set_lod(lods[i]);
-    ta.Write(i, tensors[i]);
-  }
-
-  auto merged = ta.LodPack(0);
-
-  std::vector<int> target_tensor_data{{0, 2, 6,  // 0
-                                       0, 2, 7,  // 1
-                                       0, 3,     // 2
-                                       1, 4, 8,  // 3
-                                       1, 5, 9,  // 5
-                                       1, 5, 10}};
-  EXPECT_EQ(merged.dims()[0], (int)target_tensor_data.size());
-  for (size_t i = 0; i < target_tensor_data.size(); i++) {
-    EXPECT_EQ(target_tensor_data[i], merged.data<int>()[i]);
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index 7e88e039611007d17156d10f852eb46f3ee8e7a3..aba1f9f09329f890ef190f8820b958c56f017e89 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -150,84 +150,6 @@ inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
   return *this;
 }
 
-inline void Tensor::CopyFrom(const Tensor& src,
-                             const platform::Place& dst_place,
-                             const platform::DeviceContext& ctx) {
-  src.check_memory_size();
-  Resize(src.dims());
-
-  auto src_place = src.holder_->place();
-  auto src_ptr = src.data<void>();
-
-  auto dst_ptr = mutable_data(dst_place, src.type());
-
-  auto size = src.numel() * SizeOfType(src.type());
-
-  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
-                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
-  }
-#ifdef PADDLE_WITH_CUDA
-  else if (platform::is_gpu_place(src_place) &&
-           platform::is_cpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
-    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
-    memory::Copy(
-        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
-    memory::Copy(
-        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  } else if (platform::is_gpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
-    memory::Copy(
-        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  }
-#endif
-}
-
-template <typename T>
-inline void Tensor::CopyFromVector(const std::vector<T>& src,
-                                   const platform::DeviceContext& ctx) {
-  auto dst_place = ctx.GetPlace();
-  auto src_ptr = static_cast<const void*>(src.data());
-  platform::CPUPlace src_place;
-  auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
-  auto size = src.size() * sizeof(T);
-
-  if (platform::is_cpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, src_place,
-                 src_ptr, size);
-  }
-#ifdef PADDLE_WITH_CUDA
-  else if (platform::is_gpu_place(dst_place)) {
-    memory::Copy(
-        boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
-        size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  }
-#endif
-}
-
 inline Tensor Tensor::Slice(int begin_idx, int end_idx) const {
   check_memory_size();
   PADDLE_ENFORCE_GE(begin_idx, 0,
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index 1bb0fb71b079940d35a995b78e04a531c074a8b2..ceca64365a1a628642eb374a3e3bbdff490c955a 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -188,178 +188,6 @@ TEST(Tensor, Slice) {
 #endif
 }
 
-TEST(Tensor, CopyFrom) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  {
-    Tensor src_tensor;
-    Tensor dst_tensor;
-    CPUDeviceContext cpu_ctx((CPUPlace()));
-
-    int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
-
-    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    memcpy(src_ptr, arr, 9 * sizeof(int));
-
-    auto cpu_place = new paddle::platform::CPUPlace();
-    dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx);
-
-    const int* dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, dst_ptr);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-    }
-
-    Tensor slice_tensor = src_tensor.Slice(1, 2);
-    dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx);
-    const int* slice_ptr = slice_tensor.data<int>();
-    dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(dst_ptr, slice_ptr);
-    for (size_t i = 0; i < 3; ++i) {
-      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
-    }
-  }
-#ifdef PADDLE_WITH_CUDA
-  {
-    Tensor src_tensor;
-    Tensor gpu_tensor;
-    Tensor dst_tensor;
-
-    int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
-
-    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    memcpy(src_ptr, arr, 9 * sizeof(int));
-
-    // CPU Tensor to GPU Tensor
-    auto gpu_place = new paddle::platform::GPUPlace(0);
-    CUDADeviceContext gpu_ctx(*gpu_place);
-    gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx);
-
-    // GPU Tensor to CPU Tensor
-    auto cpu_place = new paddle::platform::CPUPlace();
-    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
-
-    // Sync before Compare Tensors
-    gpu_ctx.Wait();
-    const int* dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, dst_ptr);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-    }
-
-    Tensor slice_tensor = src_tensor.Slice(1, 2);
-
-    // CPU Slice Tensor to GPU Tensor
-    gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx);
-
-    // GPU Tensor to CPU Tensor
-    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
-
-    // Sync before Compare Slice Tensors
-    gpu_ctx.Wait();
-    const int* slice_ptr = slice_tensor.data<int>();
-    dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(dst_ptr, slice_ptr);
-    for (size_t i = 0; i < 3; ++i) {
-      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
-    }
-  }
-#endif
-}
-
-TEST(Tensor, CopyFromVector) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  {
-    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    Tensor cpu_tensor;
-
-    // Copy to CPU Tensor
-    cpu_tensor.Resize(make_ddim({3, 3}));
-    auto cpu_place = new paddle::platform::CPUPlace();
-    CPUDeviceContext cpu_ctx(*cpu_place);
-    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
-
-    // Compare Tensors
-    const int* cpu_ptr = cpu_tensor.data<int>();
-    const int* src_ptr = src_vec.data();
-    ASSERT_NE(src_ptr, cpu_ptr);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-    }
-
-    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
-    cpu_tensor.Resize(make_ddim({2, 2}));
-    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
-    cpu_ptr = cpu_tensor.data<int>();
-    src_ptr = src_vec.data();
-    ASSERT_NE(src_ptr, cpu_ptr);
-    for (size_t i = 0; i < 5; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-    }
-
-    delete cpu_place;
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  {
-    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    Tensor cpu_tensor;
-    Tensor gpu_tensor;
-    Tensor dst_tensor;
-
-    // Copy to CPU Tensor
-    cpu_tensor.Resize(make_ddim({3, 3}));
-    auto cpu_place = new paddle::platform::CPUPlace();
-    CPUDeviceContext cpu_ctx(*cpu_place);
-    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
-
-    // Copy to GPUTensor
-    gpu_tensor.Resize(make_ddim({3, 3}));
-    auto gpu_place = new paddle::platform::GPUPlace();
-    CUDADeviceContext gpu_ctx(*gpu_place);
-    gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
-    // Copy from GPU to CPU tensor for comparison
-    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
-
-    // Sync before Compare Tensors
-    gpu_ctx.Wait();
-    const int* src_ptr = src_vec.data();
-    const int* cpu_ptr = cpu_tensor.data<int>();
-    const int* dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, cpu_ptr);
-    ASSERT_NE(src_ptr, dst_ptr);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-    }
-
-    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
-
-    cpu_tensor.Resize(make_ddim({2, 2}));
-    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
-    gpu_tensor.Resize(make_ddim({2, 2}));
-    gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
-    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
-
-    // Sync before Compare Tensors
-    gpu_ctx.Wait();
-    src_ptr = src_vec.data();
-    cpu_ptr = cpu_tensor.data<int>();
-    dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, cpu_ptr);
-    ASSERT_NE(src_ptr, dst_ptr);
-    for (size_t i = 0; i < 5; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-    }
-
-    delete cpu_place;
-    delete gpu_place;
-  }
-#endif
-}
-
 TEST(Tensor, ReshapeToMatrix) {
   using namespace paddle::framework;
   using namespace paddle::platform;
diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e34b90d57eed8fea84b83045df61a98483c8849
--- /dev/null
+++ b/paddle/framework/tensor_util.h
@@ -0,0 +1,152 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace framework {
+
+/**
+ * @brief   Copy the content of external tensor to a new place.
+ *
+ * @param[in] src        The external tensor.
+ * @param[in] dst_place  The dst place.
+ * @param[in] ctx        The device context contains device resources.
+ *
+ * @note    CopyFrom supports CPU <-> GPU, GPU <-> GPU.
+ */
+
+inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
+                     const platform::DeviceContext& ctx, Tensor* dst) {
+  src.check_memory_size();
+
+  dst->Resize(src.dims());
+  auto src_place = src.place();
+  auto src_ptr = src.data<void>();
+
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
+
+  auto size = src.numel() * SizeOfType(src.type());
+
+  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  } else if (platform::is_cpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  } else if (platform::is_gpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+
+/**
+ * @brief   Copy the content of an external vector to a tensor.
+ *
+ * @param[in] src        The external tensor.
+ * @param[in] ctx        The device context contains device resources.
+ *
+ * * @note    CopyFromVector assumes that the tensor has been resized
+ *            before invoking.
+ */
+template <typename T>
+inline void CopyFromVector(const std::vector<T>& src,
+                           const platform::DeviceContext& ctx, Tensor* dst) {
+  auto dst_place = ctx.GetPlace();
+  auto src_ptr = static_cast<const void*>(src.data());
+  platform::CPUPlace src_place;
+  dst->Resize({static_cast<int64_t>(src.size())});
+  auto dst_ptr = static_cast<void*>(dst->mutable_data<T>(dst_place));
+  auto size = src.size() * sizeof(T);
+
+  if (platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, src_place,
+                 src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(dst_place)) {  // NOLINT
+    memory::Copy(
+        boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
+        size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+
+/**
+ * @brief   Copy the content of a tensor to a vector
+ *
+ * @param[in] src        The external tensor.
+ * @param[in] ctx        The device context contains device resources.
+ *
+ * * @note    CopyFromVector assumes that the tensor has been resized
+ *            before invoking.
+ */
+template <typename T>
+inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
+                         std::vector<T>* dst) {
+  auto src_ptr = static_cast<const void*>(src.data<T>());
+  auto size = src.numel() * sizeof(T);
+
+  platform::CPUPlace dst_place;
+  dst->resize(src.numel());
+  auto dst_ptr = static_cast<void*>(dst->data());
+
+  if (platform::is_cpu_place(src.place())) {
+    memory::Copy(dst_place, dst_ptr,
+                 boost::get<platform::CPUPlace>(src.place()), src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(src.place())) {  // NOLINT
+    memory::Copy(
+        dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()),
+        src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..03a70de182d0eb499a81413d38229c81c4378b91
--- /dev/null
+++ b/paddle/framework/tensor_util_test.cc
@@ -0,0 +1,228 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include "paddle/framework/tensor_util.h"
+#include <gtest/gtest.h>
+#include <string>
+
+namespace paddle {
+namespace framework {
+TEST(CopyFrom, Tensor) {
+  Tensor src_tensor;
+  Tensor dst_tensor;
+  platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
+
+  int* src_ptr =
+      src_tensor.mutable_data<int>(make_ddim({3, 3}), platform::CPUPlace());
+
+  int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  memcpy(src_ptr, arr, 9 * sizeof(int));
+
+  auto cpu_place = new platform::CPUPlace();
+  CopyFrom(src_tensor, *cpu_place, cpu_ctx, &dst_tensor);
+
+  const int* dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(src_ptr, dst_ptr);
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+  }
+
+  Tensor slice_tensor = src_tensor.Slice(1, 2);
+  CopyFrom(slice_tensor, *cpu_place, cpu_ctx, &dst_tensor);
+  const int* slice_ptr = slice_tensor.data<int>();
+  dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(dst_ptr, slice_ptr);
+  for (size_t i = 0; i < 3; ++i) {
+    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+  }
+#ifdef PADDLE_WITH_CUDA
+  {
+    Tensor src_tensor;
+    Tensor gpu_tensor;
+    Tensor dst_tensor;
+
+    int* src_ptr =
+        src_tensor.mutable_data<int>(make_ddim({3, 3}), platform::CPUPlace());
+
+    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    memcpy(src_ptr, arr, 9 * sizeof(int));
+
+    // CPU Tensor to GPU Tensor
+    auto gpu_place = new platform::GPUPlace(0);
+    platform::CUDADeviceContext gpu_ctx(*gpu_place);
+    CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+
+    // GPU Tensor to CPU Tensor
+    auto cpu_place = new platform::CPUPlace();
+    CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    Tensor slice_tensor = src_tensor.Slice(1, 2);
+
+    // CPU Slice Tensor to GPU Tensor
+    CopyFrom(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+
+    // GPU Tensor to CPU Tensor
+    CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Slice Tensors
+    gpu_ctx.Wait();
+    const int* slice_ptr = slice_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(dst_ptr, slice_ptr);
+    for (size_t i = 0; i < 3; ++i) {
+      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+    }
+  }
+#endif
+}
+
+TEST(CopyFromVector, Tensor) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor cpu_tensor;
+
+    // Copy to CPU Tensor
+    cpu_tensor.Resize(make_ddim({3, 3}));
+    auto cpu_place = new paddle::platform::CPUPlace();
+    CPUDeviceContext cpu_ctx(*cpu_place);
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+
+    // Compare Tensors
+    const int* cpu_ptr = cpu_tensor.data<int>();
+    const int* src_ptr = src_vec.data();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+    }
+
+    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
+    cpu_tensor.Resize(make_ddim({2, 2}));
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    cpu_ptr = cpu_tensor.data<int>();
+    src_ptr = src_vec.data();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    for (size_t i = 0; i < 5; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+    }
+
+    delete cpu_place;
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor cpu_tensor;
+    Tensor gpu_tensor;
+    Tensor dst_tensor;
+
+    // Copy to CPU Tensor
+    cpu_tensor.Resize(make_ddim({3, 3}));
+    auto cpu_place = new paddle::platform::CPUPlace();
+    CPUDeviceContext cpu_ctx(*cpu_place);
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+
+    // Copy to GPUTensor
+    gpu_tensor.Resize(make_ddim({3, 3}));
+    auto gpu_place = new paddle::platform::GPUPlace();
+    CUDADeviceContext gpu_ctx(*gpu_place);
+    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    // Copy from GPU to CPU tensor for comparison
+    CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    const int* src_ptr = src_vec.data();
+    const int* cpu_ptr = cpu_tensor.data<int>();
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
+
+    cpu_tensor.Resize(make_ddim({2, 2}));
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    gpu_tensor.Resize(make_ddim({2, 2}));
+    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    src_ptr = src_vec.data();
+    cpu_ptr = cpu_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 5; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    delete cpu_place;
+    delete gpu_place;
+  }
+#endif
+}
+
+TEST(CopyToVector, Tensor) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    Tensor src;
+    int* src_ptr = src.mutable_data<int>({3, 3}, CPUPlace());
+    for (int i = 0; i < 3 * 3; ++i) {
+      src_ptr[i] = i;
+    }
+
+    CPUPlace place;
+    CPUDeviceContext cpu_ctx(place);
+    std::vector<int> dst;
+    CopyToVector<int>(src, cpu_ctx, &dst);
+
+    for (int i = 0; i < 3 * 3; ++i) {
+      EXPECT_EQ(src_ptr[i], dst[i]);
+    }
+  }
+#ifdef PADDLE_WITH_CUDA
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor gpu_tensor;
+    GPUPlace place;
+    CUDADeviceContext gpu_ctx(place);
+    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+
+    std::vector<int> dst;
+    CopyToVector<int>(gpu_tensor, gpu_ctx, &dst);
+
+    for (int i = 0; i < 3 * 3; ++i) {
+      EXPECT_EQ(src_vec[i], dst[i]);
+    }
+  }
+#endif
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 91d732641a4a5eed050841b59fd10da397eb732f..41ead3c5ecef248830cfb0f8be360f21dcd58e7b 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -73,7 +73,6 @@ if(MOBILE_INFERENCE)
     list(REMOVE_ITEM GSERVER_SOURCES
          dataproviders/DataProvider.cpp
          dataproviders/MultiDataProvider.cpp
-         dataproviders/ProtoDataProvider.cpp
          dataproviders/PyDataProvider2.cpp
          dataproviders/PyDataProvider.cpp)
 
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index 8b7b2e9b65898950e036ebc023cd28990cef303f..f5a41b66bf09a4abc5ae7b64f227ca52461408f5 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -212,6 +212,37 @@ Error __must_check backward(Argument& act) {
 }
 END_DEFINE_ACTIVATION(sequence_softmax)
 
+/*
+ * @brief SoftSign Activation.
+ * \f[
+ * f(z) = \frac{z}{1 + |z|}
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(softsign)
+private:
+MatrixPtr denominator_;
+
+Error __must_check forward(Argument& act) {
+  size_t height = act.value->getHeight();
+  size_t width = act.value->getWidth();
+  Matrix::resizeOrCreate(
+      denominator_, height, width, false, useGpu(act.deviceId));
+  denominator_->assign(*act.value);
+  denominator_->abs2();
+  denominator_->add(1.);
+
+  act.value->dotDiv(*act.value, *denominator_);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  denominator_->square2();
+  denominator_->scalarDiv(*denominator_, 1.);
+  act.grad->dotMul(*act.grad, *denominator_);
+  return Error();
+}
+END_DEFINE_ACTIVATION(softsign)
+
 /**
  * @brief Relu Activation.
  * forward. y = max(0, z)
diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp
index 0478256f9cd81f4a99eb0cbcbd1a5a21de5cf14b..106cf5b6228e636026ded558d0f591022f1ae586 100644
--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include <unistd.h>
 #include <algorithm>
-#include "ProtoDataProvider.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
 #include "paddle/utils/StringUtil.h"
 #include "paddle/utils/Util.h"
 
@@ -164,8 +164,6 @@ DataProvider* DataProvider::create(const DataConfig& config,
 
 REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
 REGISTER_DATA_PROVIDER(dummy, DummyDataProvider);
-REGISTER_DATA_PROVIDER(proto, ProtoDataProvider);
-REGISTER_DATA_PROVIDER(proto_sequence, ProtoSequenceDataProvider);
 
 int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
   int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch)
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
deleted file mode 100644
index c6f5cab1915b7f41d505c37a7fef762a392bad7f..0000000000000000000000000000000000000000
--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ /dev/null
@@ -1,932 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ProtoDataProvider.h"
-#include <algorithm>
-#include <fstream>
-#include <istream>
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Util.h"
-
-#include "DataProviderGroup.h"
-#include "paddle/utils/Logging.h"
-
-DEFINE_double(memory_threshold_on_load_data,
-              1.0,
-              "stop loading data when memory is not sufficient");
-
-namespace paddle {
-
-REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup<ProtoDataProvider>);
-REGISTER_DATA_PROVIDER(proto_sequence_group,
-                       DataProviderGroup<ProtoSequenceDataProvider>);
-
-ProtoDataProvider::ProtoDataProvider(const DataConfig& config,
-                                     bool useGpu,
-                                     bool loadDataAll)
-    : DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) {
-  if (loadDataAll) {
-    loadData(config_.files());
-  }
-}
-
-void ProtoDataProvider::loadData(const std::vector<std::string>& fileList) {
-  for (auto& file : fileList) {
-    if (FLAGS_memory_threshold_on_load_data < 1.0) {
-      double memUsage = getMemoryUsage();
-      if (memUsage > FLAGS_memory_threshold_on_load_data) {
-        LOG(INFO) << "memUsage is " << memUsage << ", > "
-                  << FLAGS_memory_threshold_on_load_data
-                  << " therefore SKIP ALL REMAINING file.";
-        break;
-      }
-    }
-    LOG(INFO) << "load data file " << file;
-    loadDataFile(file);
-  }
-
-  if (sequenceStartPositions_.size() == sampleNums_) {
-    // This means that each sample is one sequence
-    shuffledSequenceIds_.swap(sequenceStartPositions_);
-  } else {
-    sequenceStartPositions_.push_back(sampleNums_);
-    shuffledSequenceIds_.reserve(sequenceStartPositions_.size() - 1);
-    for (size_t i = 0; i < sequenceStartPositions_.size() - 1; ++i) {
-      shuffledSequenceIds_.push_back(i);
-    }
-  }
-
-  LOG(INFO) << "read done, num of instance=" << sampleNums_;
-  showDataStats();
-}
-
-void ProtoDataProvider::loadData(const std::string& fileName) {
-  std::vector<std::string> fileList;
-  loadFileList(fileName, fileList);
-  loadData(fileList);
-}
-
-void ProtoDataProvider::checkDataHeader(const DataHeader& header) {
-  if (header_.slot_defs_size()) {
-    // header_ is already set. Need to check consistency.
-    CHECK_EQ(header_.slot_defs_size(), header.slot_defs_size())
-        << "Different header";
-    for (int i = 0; i < header.slot_defs_size(); ++i) {
-      CHECK_EQ(header_.slot_defs(i).type(), header.slot_defs(i).type());
-      CHECK_EQ(header_.slot_defs(i).dim(), header.slot_defs(i).dim());
-    }
-    return;
-  }
-
-  // header_ is not set before
-  CHECK(header.slot_defs_size()) << "Invalid header: no slot is defined";
-  int i;
-  for (i = 0; i < header.slot_defs_size(); ++i) {
-    if (header.slot_defs(i).type() == SlotDef::INDEX ||
-        header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX) {
-      break;
-    }
-    constexpr int kBufLen = 100;
-    char buf[kBufLen];
-    snprintf(buf, kBufLen, "slot%d_nnz", i);
-    nnzStats_.push_back(getStat(buf));
-  }
-  numVecSlots_ = i;
-
-  // Check that INDEX slots are after VECTOR slots
-  for (int i = numVecSlots_; i < header.slot_defs_size(); ++i) {
-    CHECK(header.slot_defs(i).type() == SlotDef::INDEX ||
-          header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX);
-  }
-
-  slots_.clear();
-  slots_.reserve(header.slot_defs_size());
-  for (int i = 0; i < header.slot_defs_size(); ++i) {
-    slots_.emplace_back();
-    slots_.back().type = header.slot_defs(i).type();
-    slots_.back().dim = header.slot_defs(i).dim();
-    if (SlotDef::VECTOR_SPARSE_NON_VALUE == header.slot_defs(i).type() ||
-        SlotDef::VECTOR_SPARSE_VALUE == header.slot_defs(i).type()) {
-      slots_.back().indices.push_back(0);
-    }
-  }
-
-  header_ = header;
-}
-
-void ProtoDataProvider::checkSample(const DataSample& sample) {
-  CHECK_EQ(numVecSlots_, sample.vector_slots_size());
-  CHECK(header_.slot_defs_size() == numVecSlots_ + sample.id_slots_size() ||
-        header_.slot_defs_size() == numVecSlots_ + sample.var_id_slots_size());
-  for (int i = 0; i < numVecSlots_; ++i) {
-    uint32_t dim = header_.slot_defs(i).dim();
-    switch (header_.slot_defs(i).type()) {
-      case SlotDef::VECTOR_DENSE: {
-        CHECK_EQ(static_cast<int>(dim), sample.vector_slots(i).values_size());
-        CHECK_EQ(0, sample.vector_slots(i).ids_size());
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        if (0 == sample.vector_slots(i).ids_size()) {
-          break;
-        }
-        CHECK_LT(0, sample.vector_slots(i).ids_size());
-        CHECK_EQ(0, sample.vector_slots(i).values_size());
-        auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(),
-                                       sample.vector_slots(i).ids().end());
-        CHECK_GT(dim, maxId);
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        if (0 == sample.vector_slots(i).ids_size()) {
-          CHECK_EQ(0, sample.vector_slots(i).values_size());
-          break;
-        }
-        CHECK_LT(0, sample.vector_slots(i).values_size());
-        CHECK_GE(static_cast<int>(dim), sample.vector_slots(i).values_size());
-        CHECK_EQ(sample.vector_slots(i).values_size(),
-                 sample.vector_slots(i).ids_size());
-        auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(),
-                                       sample.vector_slots(i).ids().end());
-        CHECK_GT(dim, maxId);
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE: {
-        if (static_cast<int>(dim) != 0) {
-          CHECK_EQ(static_cast<int>(dim), sample.vector_slots(i).values_size());
-          if (sample.vector_slots(i).dims_size() != 0) {
-            int totalDim = sample.vector_slots(i).dims(0);
-            for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) {
-              totalDim *= sample.vector_slots(i).dims(j);
-            }
-            CHECK_EQ(static_cast<int>(dim), totalDim);
-          }
-        } else {
-          CHECK_NE(sample.vector_slots(i).dims_size(), 0);
-          int totalDim = sample.vector_slots(i).dims(0);
-          for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) {
-            totalDim *= sample.vector_slots(i).dims(j);
-          }
-          CHECK_EQ(totalDim, sample.vector_slots(i).values_size());
-        }
-        break;
-      }
-      case SlotDef::STRING: {
-        CHECK_EQ(static_cast<int>(1), sample.vector_slots(i).strs_size());
-        CHECK_EQ(0, sample.vector_slots(i).ids_size());
-        CHECK_EQ(0, sample.vector_slots(i).values_size());
-        break;
-      }
-      default:
-        LOG(FATAL) << "BUG: Should not reach here";
-    }
-  }
-  for (int i = numVecSlots_; i < header_.slot_defs_size(); ++i) {
-    if (header_.slot_defs(i).type() != SlotDef::VAR_MDIM_INDEX) {
-      uint32_t id = sample.id_slots(i - numVecSlots_);
-      if (id == -1U) continue;
-      CHECK_LT(id, header_.slot_defs(i).dim());
-    } else {
-      for (int j = 0; j < sample.var_id_slots(i - numVecSlots_).ids_size();
-           ++j) {
-        uint32_t id = sample.var_id_slots(i - numVecSlots_).ids(j);
-        CHECK_LT(id, header_.slot_defs(i).dim());
-      }
-    }
-  }
-}
-
-void ProtoDataProvider::loadDataFile(const std::string& fileName) {
-  std::ifstream is(fileName);
-  CHECK(is) << "Fail to open " << fileName;
-  bool dataCompression = str::endsWith(fileName, ".gz");
-  std::unique_ptr<ProtoReader> reader(new ProtoReader(&is, dataCompression));
-  CHECK(reader) << "Fail to create proto data input stream";
-
-  DataHeader header;
-  CHECK(reader->read(&header));
-  checkDataHeader(header);
-
-  DataSample sample;
-  do {
-    if (!reader->read(&sample)) {
-      break;
-    }
-    checkSample(sample);
-    if (sample.is_beginning()) {
-      sequenceStartPositions_.push_back(sampleNums_);
-    }
-    fillSlots(sample);
-    ++sampleNums_;
-  } while (true);
-
-  CHECK(is.eof()) << "Fail to read file";
-  reader.reset(nullptr);
-  is.close();
-}
-
-// checkSample has done before, no check here
-void ProtoDataProvider::fillSlots(const DataSample& sample) {
-  for (size_t i = 0; i < slots_.size(); ++i) {
-    auto& slot = slots_[i];
-    int dim = slot.dim;
-    switch (slot.type) {
-      case SlotDef::VECTOR_DENSE: {
-        size_t oldSize = slot.denseData.size();
-        slot.denseData.resize(oldSize + dim);
-        const float* values = sample.vector_slots(i).values().data();
-#ifdef PADDLE_TYPE_DOUBLE
-        std::copy(values, values + dim, slot.denseData.begin() + oldSize);
-#else
-        memcpy(slot.denseData.data() + oldSize, values, sizeof(real) * dim);
-#endif
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        int slotSize = sample.vector_slots(i).ids_size();
-        int subSlotSize = 0;
-        int id = 0;  // the slot id
-        // find whether this vector_slots has subseq. If not has subseq,
-        // subSlotSize = 0.
-        for (id = 0; id < sample.subseq_slots_size(); id++) {
-          if (sample.subseq_slots(id).slot_id() == i) {
-            subSlotSize = sample.subseq_slots(id).lens_size();
-            break;
-          }
-        }
-        if (subSlotSize && slot.subIndices.size() == 0UL) {
-          // If has subSeq, the first element of subIndices = 0.
-          slot.subIndices.push_back(0);
-        }
-        if (slotSize == 0UL) {
-          // if has no id, new indices = old indices.
-          slot.indices.push_back(slot.indices.back());
-          // if has subSeq, new subIndices = old subIndices.
-          if (slot.subIndices.size()) {
-            slot.subIndices.push_back(slot.subIndices.back());
-          }
-          break;
-        }
-        slot.sparseNonValueData.resize(slot.indices.back() + slotSize);
-        const unsigned int* ids = sample.vector_slots(i).ids().data();
-        memcpy(slot.sparseNonValueData.data() + slot.indices.back(),
-               ids,
-               sizeof(*ids) * slotSize);
-        slot.indices.push_back(slot.indices.back() + slotSize);
-        if (subSlotSize) {
-          for (int ii = 0; ii < subSlotSize; ++ii) {
-            slot.subIndices.push_back(slot.subIndices.back() +
-                                      sample.subseq_slots(id).lens(ii));
-          }
-        }
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        if (0 == sample.vector_slots(i).ids_size()) {
-          slot.indices.push_back(slot.indices.back());
-          break;
-        }
-        int slotSize = sample.vector_slots(i).ids_size();
-        slot.sparseFloatValueData.resize(slot.indices.back() + slotSize);
-        const unsigned int* ids = sample.vector_slots(i).ids().data();
-        const float* values = sample.vector_slots(i).values().data();
-        for (int ii = 0; ii < slotSize; ++ii) {
-          slot.sparseFloatValueData[slot.indices.back() + ii].col = ids[ii];
-          slot.sparseFloatValueData[slot.indices.back() + ii].value =
-              values[ii];
-        }
-        slot.indices.push_back(slot.indices.back() + slotSize);
-        break;
-      }
-      case SlotDef::INDEX: {
-        slot.indexData.push_back(sample.id_slots(i - numVecSlots_));
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE: {
-        size_t oldSize = slot.varDenseData.size();
-        slot.varDenseData.resize(oldSize + 1);
-        size_t varDim = sample.vector_slots(i).values_size();
-        slot.varDenseData[oldSize].data.resize(varDim);
-        const float* values = sample.vector_slots(i).values().data();
-#ifdef PADDLE_TYPE_DOUBLE
-        std::copy(
-            values, values + varDim, slot.varDenseData[oldSize].data.data());
-#else
-        memcpy(slot.varDenseData[oldSize].data.data(),
-               values,
-               sizeof(real) * varDim);
-#endif
-        slot.varDenseData[oldSize].dims.resize(
-            sample.vector_slots(i).dims_size());
-        memcpy(slot.varDenseData[oldSize].dims.data(),
-               sample.vector_slots(i).dims().data(),
-               sizeof(uint32_t) * sample.vector_slots(i).dims_size());
-        break;
-      }
-      case SlotDef::VAR_MDIM_INDEX: {
-        size_t oldSize = slot.varIndices.size();
-        slot.varIndices.resize(oldSize + 1);
-        size_t varDim = sample.var_id_slots(i - numVecSlots_).ids_size();
-        slot.varIndices[oldSize].resize(varDim);
-        memcpy(slot.varIndices[oldSize].data(),
-               sample.var_id_slots(i - numVecSlots_).ids().data(),
-               sizeof(uint32_t) * varDim);
-        break;
-      }
-      case SlotDef::STRING: {
-        slot.strData.push_back(sample.vector_slots(i).strs(0));
-        break;
-      }
-    }
-  }
-}
-
-void ProtoDataProvider::showDataStats() {
-  std::ostringstream oss;
-  for (size_t i = 0; i < slots_.size(); ++i) {
-    auto& slot = slots_[i];
-    if (slot.type == SlotDef::VECTOR_SPARSE_NON_VALUE) {
-      size_t nnz = slot.sparseNonValueData.size();
-      oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; ";
-    } else if (slot.type == SlotDef::VECTOR_SPARSE_VALUE) {
-      size_t nnz = slot.sparseFloatValueData.size();
-      oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; ";
-    }
-  }
-  LOG(INFO) << oss.str();
-}
-
-void ProtoDataProvider::reset() {
-  currentSequenceIndex_ = 0;
-  if (!skipShuffle_) {
-    shuffle();
-  }
-
-  DataProvider::reset();
-}
-
-void ProtoDataProvider::shuffle() {
-  std::shuffle(shuffledSequenceIds_.begin(),
-               shuffledSequenceIds_.end(),
-               ThreadLocalRandomEngine::get());
-}
-
-/*
-  Loop through sequences starting from currentSequenceIndex_
-  for at most size samples. For each sequence ranging from [begin, end),
-  op(begin, end) will be called.
-
-  return the number of sequences scanned
-*/
-template <class Op>
-int64_t ProtoDataProvider::sequenceLoop(Op op, int64_t size) {
-  int64_t sz = 0;
-  size_t i;
-  size_t sequenceCount = shuffledSequenceIds_.size();
-  if (usageRatio_ < 1.0f) {
-    sequenceCount = static_cast<int64_t>(sequenceCount * usageRatio_);
-  }
-  for (i = currentSequenceIndex_; i < sequenceCount; ++i) {
-    size_t id = shuffledSequenceIds_[i];
-    int64_t begin = sequenceStartPositions_[id];
-    int64_t end = sequenceStartPositions_[id + 1];
-    int64_t len = end - begin;
-    if (sz + len > size && sz > 0) break;
-    sz += len;
-    op(begin, end);
-  }
-  return i - currentSequenceIndex_;
-}
-
-/*
-  Loop through sequences starting from currentSequenceIndex_
-  for at most size samples. For each sample of each sequence at position
-  pos, op(pos) will be called.
-
-  return the number of sequences scanned
-*/
-template <class Op>
-int64_t ProtoDataProvider::sampleLoop(Op op, int64_t size) {
-  if (iidData()) {
-    size = std::min<int64_t>(sampleNums_ - currentSequenceIndex_, size);
-    for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size;
-         ++i) {
-      size_t pos = shuffledSequenceIds_[i];
-      op(pos);
-    }
-    return size;
-  } else {
-    auto f = [op](int64_t begin, int64_t end) {
-      for (int64_t pos = begin; pos < end; ++pos) {
-        op(pos);
-      }
-    };
-    return sequenceLoop(f, size);
-  }
-}
-
-/*
-  Loop through sub-sequences starting from currentSequenceIndex_
-  for at most size samples. For each sample of each sub-sequence at position
-  pos, op(pos) will be called.
-
-  return the number of sub-sequences scanned
-*/
-template <class Op>
-int64_t ProtoDataProvider::subSampleLoop(Op op, int64_t size, int slot) {
-  CHECK(iidData()) << "subSampleLoop only accepts iid data";
-  size = std::min<int64_t>(sampleNums_ - currentSequenceIndex_, size);
-  int subSize = 0;
-  for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size;
-       ++i) {
-    size_t pos = shuffledSequenceIds_[i];
-    int64_t* indexs = slots_[slot].indices.data();
-    int64_t* subIndexs = slots_[slot].subIndices.data();
-    int64_t subSeqStart = 0;
-    int64_t subSeqEnd = 0;
-    for (int j = 0; j < (int)slots_[slot].subIndices.size(); j++) {
-      if (subIndexs[j] == indexs[pos]) {
-        subSeqStart = j;
-        if (subIndexs[pos] == subIndexs[pos + 1]) {
-          subSeqEnd = j + 1;
-          break;
-        }
-      } else if (subIndexs[j] == indexs[pos + 1]) {
-        subSeqEnd = j;
-        break;
-      }
-    }
-    for (int j = subSeqStart; j < subSeqEnd; j++) {
-      op(j);
-    }
-    subSize += subSeqEnd - subSeqStart;
-  }
-  return subSize;
-}
-
-int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
-                                                DataBatch* batch) {
-  int64_t numSequences = 0;  // actual number of sequences in the batch
-
-  // the number of sequences scanned, including those skipped because too long
-  int64_t numScannedSeqs = 0;
-  std::lock_guard<RWLock> guard(lock_);
-  if (iidData()) {
-    size = std::min<int64_t>(getSize() - currentSequenceIndex_, size);
-    numScannedSeqs = numSequences = size;
-  } else {
-    int64_t sz = 0;
-    auto op = [&sz, &numSequences](int64_t begin, int64_t end) {
-      ++numSequences;
-      sz += end - begin;
-    };
-    numScannedSeqs = sequenceLoop(op, size);
-    VLOG_IF(1, numScannedSeqs > numSequences)
-        << numScannedSeqs - numSequences
-        << " sequences are skipped because longer than " << size;
-    size = sz;
-  }
-  if (size <= 0) return 0;
-
-  DataBatch& cpuBatch = *cpuBatch_;
-  std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-  cpuBatch.setSize(size);
-  cpuArguments.resize(header_.slot_defs_size());
-
-  if (!iidData()) {
-    ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions,
-                                  numSequences + 1,
-                                  /* useGpu= */ false);
-    int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false);
-    int pos = 0;
-    int i = 0;
-    auto op = [buf, &pos, &i](int64_t begin, int64_t end) {
-      buf[i] = pos;
-      pos += end - begin;
-      ++i;
-    };
-    sequenceLoop(op, size);
-    buf[i] = size;
-    for (size_t slot = 1; slot < cpuArguments.size(); ++slot) {
-      cpuArguments[slot].sequenceStartPositions =
-          cpuArguments[0].sequenceStartPositions;
-    }
-  }
-
-  for (int slot = 0; slot < header_.slot_defs_size(); ++slot) {
-    size_t dim = header_.slot_defs(slot).dim();
-    SlotDef::SlotType slotType = header_.slot_defs(slot).type();
-
-    std::vector<int64_t> dataPos;
-    dataPos.reserve(size);
-    auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); };
-    sampleLoop(op, size);
-
-    switch (slotType) {
-      case SlotDef::VECTOR_DENSE: {
-        Matrix::resizeOrCreate(cpuArguments[slot].value,
-                               size,
-                               dim,
-                               false,   // trans = false
-                               false);  // useGpu = false
-        real* buf = cpuArguments[slot].value->getData();
-        for (int i = 0; i < size; ++i) {
-          memcpy(buf + i * dim,
-                 slots_[slot].denseData.data() + dataPos[i] * dim,
-                 sizeof(real) * dim);
-        }
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        if (!(cpuArguments[slot].value)) {
-          cpuArguments[slot].value =
-              Matrix::createSparseMatrix(size,
-                                         dim,
-                                         size /*DEFAULT_AVG_WIDTH = 1*/,
-                                         NO_VALUE,
-                                         SPARSE_CSR,
-                                         false,
-                                         useGpu_);
-        }
-        auto mat = cpuArguments[slot].value;
-        mat->resize(size, dim);
-        if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseNonValueData.data(),
-              HPPL_STREAM_1);
-        } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseNonValueData.data());
-        } else {
-          LOG(FATAL) << "Not Supported";
-        }
-        size_t numElements = 0;
-        for (auto pos : dataPos) {
-          numElements +=
-              slots_[slot].indices[pos + 1] - slots_[slot].indices[pos];
-        }
-        nnzStats_[slot]->addSample(numElements);
-
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        if (!(cpuArguments[slot].value)) {
-          cpuArguments[slot].value =
-              Matrix::createSparseMatrix(size,
-                                         dim,
-                                         size /*DEFAULT_AVG_WIDTH = 1*/,
-                                         FLOAT_VALUE,
-                                         SPARSE_CSR,
-                                         false,
-                                         useGpu_);
-        }
-        auto mat = cpuArguments[slot].value;
-        mat->resize(size, dim);
-        if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseFloatValueData.data(),
-              HPPL_STREAM_1);
-        } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseFloatValueData.data());
-        } else {
-          LOG(FATAL) << "Not Supported";
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                size,
-                                /*  useGpu= */ false);
-        int* buf = cpuArguments[slot].ids->getData();
-        for (int i = 0; i < size; ++i) {
-          buf[i] = slots_[slot].indexData[dataPos[i]];
-        }
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE: {
-        CHECK_EQ(size, 1);
-        auto mat = cpuArguments[slot].value;
-        size_t totalDim = slots_[slot].varDenseData[dataPos[0]].data.size();
-
-        CHECK_EQ(slots_[slot].varDenseData[dataPos[0]].dims.size(), size_t(3));
-        size_t height, width, depth, oldWidth;
-        /* dims[2] is depth, will be changed to dims[0] in future */
-        depth = slots_[slot].varDenseData[dataPos[0]].dims[2];
-        height = slots_[slot].varDenseData[dataPos[0]].dims[1];
-        width = slots_[slot].varDenseData[dataPos[0]].dims[0];
-        oldWidth = width;
-        /* process the undesirable sample */
-        if (oldWidth < height) {
-          width = height;
-        }
-        cpuArguments[slot].setFrameHeight(height);
-        cpuArguments[slot].setFrameWidth(width);
-
-        if (oldWidth < height) {
-          totalDim = width * height * depth;
-        }
-        Matrix::resizeOrCreate(cpuArguments[slot].value,
-                               size,
-                               totalDim,
-                               false,   // trans = false
-                               false);  // useGpu = false
-        real* buf = cpuArguments[slot].value->getData();
-        cpuArguments[slot].value->zeroMem();
-        if (oldWidth < height) {
-          real* srcBuf = slots_[slot].varDenseData[dataPos[0]].data.data();
-          for (size_t i = 0; i < depth; i++) {
-            for (size_t j = 0; j < height; j++) {
-              for (size_t k = 0; k < oldWidth; k++) {
-                buf[i * height * width + j * width + k] =
-                    srcBuf[i * height * oldWidth + j * oldWidth + k];
-              }
-            }
-          }
-        } else {
-          memcpy(buf,
-                 slots_[slot].varDenseData[dataPos[0]].data.data(),
-                 sizeof(real) * totalDim);
-        }
-        ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
-                                      size + 1, /* size == 1 currently */
-                                      /* useGpu= */ false);
-        int* bufStarts =
-            cpuArguments[slot].sequenceStartPositions->getMutableData(false);
-        bufStarts[0] = 0;
-        bufStarts[1] = 1;
-        break;
-      }
-      case SlotDef::VAR_MDIM_INDEX: {
-        CHECK_EQ(size, 1);
-        size_t totalDim = slots_[slot].varIndices[dataPos[0]].size();
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                totalDim,
-                                /*  useGpu= */ false);
-        int* buf = cpuArguments[slot].ids->getData();
-        memcpy(buf,
-               slots_[slot].varIndices[dataPos[0]].data(),
-               sizeof(int) * totalDim);
-
-        ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
-                                      size + 1, /* size == 1 currently */
-                                      /* useGpu= */ false);
-        int* bufStarts =
-            cpuArguments[slot].sequenceStartPositions->getMutableData(false);
-        bufStarts[0] = 0;
-        /* we expand the convolutinal feature map to a sequence data,
-         * so there should be a corresponding sequence labels */
-        bufStarts[1] = totalDim;
-        break;
-      }
-      case SlotDef::STRING: {
-        if (cpuArguments[slot].strs) {
-          cpuArguments[slot].strs->resize(size);
-        } else {
-          cpuArguments[slot].strs =
-              std::make_shared<std::vector<std::string>>(size);
-        }
-        for (int i = 0; i < size; ++i) {
-          (*cpuArguments[slot].strs)[i] = slots_[slot].strData[dataPos[i]];
-        }
-        break;
-      }
-    }
-  }
-
-  if (useGpu_) {
-    std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-    DataBatch& gpuBatch = *gpuBatch_;
-    std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
-    gpuArguments.resize(cpuArguments.size());
-    gpuBatch.setSize(size);
-    for (int i = 0; i < header_.slot_defs_size(); ++i) {
-      SlotDef::SlotType slotType = header_.slot_defs(i).type();
-      if (SlotDef::VECTOR_SPARSE_VALUE == slotType ||
-          SlotDef::VECTOR_SPARSE_NON_VALUE == slotType) {
-        gpuArguments[i] = cpuArguments[i];
-        gpuArguments[i].sequenceStartPositions =
-            cpuArguments[i].sequenceStartPositions;
-      } else {
-        gpuArguments[i].resizeAndCopyFrom(
-            cpuArguments[i], useGpu_, HPPL_STREAM_1);
-      }
-    }
-    hl_stream_synchronize(HPPL_STREAM_1);
-    *batch = gpuBatch;
-  } else {
-    *batch = cpuBatch;
-  }
-
-  currentSequenceIndex_ += numScannedSeqs;
-
-  return batch->getSize();
-}
-
-ProtoSequenceDataProvider::ProtoSequenceDataProvider(const DataConfig& config,
-                                                     bool useGpu,
-                                                     bool loadDataAll)
-    : ProtoDataProvider(config, useGpu, loadDataAll) {}
-
-int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
-                                                        DataBatch* batch) {
-  CHECK(iidData()) << "ProtoSequenceDataProvider only accepts iid data";
-  int64_t numSequences = 0;  // actual number of sequences in the batch
-
-  // the number of sequences scanned, including those skipped because too long
-  int64_t numScannedSeqs = 0;
-  std::lock_guard<RWLock> guard(lock_);
-  size = std::min<int64_t>(getSize() - currentSequenceIndex_, size);
-  numScannedSeqs = numSequences = size;
-  if (size <= 0) return 0;
-
-  DataBatch& cpuBatch = *cpuBatch_;
-  std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-  cpuBatch.setSize(size);
-  cpuArguments.resize(header_.slot_defs_size());
-
-  for (int slot = 0; slot < header_.slot_defs_size(); ++slot) {
-    SlotDef::SlotType slotType = header_.slot_defs(slot).type();
-
-    std::vector<int64_t> dataPos;
-    dataPos.reserve(size);
-    auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); };
-    sampleLoop(op, size);
-
-    // current slot: sequenceStartPositions
-    ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
-                                  size + 1,
-                                  /* useGpu= */ false);
-
-    switch (slotType) {
-      case SlotDef::VECTOR_SPARSE_VALUE:
-      case SlotDef::VAR_MDIM_DENSE:
-      case SlotDef::VAR_MDIM_INDEX: {
-        LOG(FATAL) << "ProtoSequenceDataProvider only support"
-                   << " VECTOR_DENSE, VECTOR_SPARSE_NON_VALUE and INDEX slots";
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        // copy to IDS, not value
-        // pointers used in current slot
-        sparse_non_value_t* data = slots_[slot].sparseNonValueData.data();
-        int64_t* indexs = slots_[slot].indices.data();
-        int64_t* seqs = dataPos.data();
-
-        // current slot: i need size instances. what is the total length?
-        int totalFeatureInCurrentSlot = 0;
-        for (int ins = 0; ins < size; ins++) {
-          int64_t currInsId = seqs[ins];
-          totalFeatureInCurrentSlot +=
-              indexs[currInsId + 1] - indexs[currInsId];
-          // special: if current instance has NO feature in current slot
-          if (indexs[currInsId + 1] == indexs[currInsId]) {
-            totalFeatureInCurrentSlot++;
-          }
-        }
-        // done
-
-        // current slot: ids
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                totalFeatureInCurrentSlot,
-                                /* useGpu= */ false);
-
-        // where to write
-        int* currPosOfArgumentId = cpuArguments[slot].ids->getData();
-        int* currPosOfArgumentSeqStart =
-            cpuArguments[slot].sequenceStartPositions->getMutableData(false);
-        int allSequenceLength = 0;
-        currPosOfArgumentSeqStart[0] = 0;
-        // for each instance, copy data and fill sequence positions
-        for (int instance = 0; instance < size; instance++) {
-          int64_t currInstanceId = seqs[instance];
-          int64_t currInstanceLength =
-              indexs[currInstanceId + 1] - indexs[currInstanceId];
-          sparse_non_value_t* currInstanceData = data + indexs[currInstanceId];
-          // write sequenceStartPositions
-          allSequenceLength += currInstanceLength;
-          currPosOfArgumentSeqStart[instance + 1] = allSequenceLength;
-          // copy features
-          for (int featCopier = 0; featCopier < currInstanceLength;
-               featCopier++) {
-            currPosOfArgumentId[featCopier] = currInstanceData[featCopier].col;
-          }
-          currPosOfArgumentId += currInstanceLength;
-          // special: if current instance has NO feature in current slot
-          if (currInstanceLength == 0) {
-            allSequenceLength++;
-            currPosOfArgumentSeqStart[instance + 1] = allSequenceLength;
-            currPosOfArgumentId[0] = -1;
-            currPosOfArgumentId++;
-          }
-          // done
-        }
-        if (slots_[slot].subIndices.size()) {
-          std::vector<int64_t> dataSubPos;
-          auto op = [this, &dataSubPos](int64_t pos) {
-            dataSubPos.push_back(pos);
-          };
-          int subSize = subSampleLoop(op, size, slot);
-          ICpuGpuVector::resizeOrCreate(
-              cpuArguments[slot].subSequenceStartPositions, subSize + 1, false);
-          int* currPosOfArgumentSubSeqStart =
-              cpuArguments[slot].subSequenceStartPositions->getMutableData(
-                  false);
-          int64_t* subSeqs = dataSubPos.data();
-          int64_t* subIndexs = slots_[slot].subIndices.data();
-          int allSubSequenceLength = 0;
-          currPosOfArgumentSubSeqStart[0] = 0;
-          // for each instance, compute sub-sequence number
-          for (int instance = 0; instance < subSize; instance++) {
-            int64_t currSubInstanceId = subSeqs[instance];
-            int64_t currSubInstanceLength =
-                subIndexs[currSubInstanceId + 1] - subIndexs[currSubInstanceId];
-            // write subSequenceStartPositions
-            allSubSequenceLength += currSubInstanceLength;
-            currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength;
-            // special: if current instance has NO feature in current slot
-            if (currSubInstanceLength == 0) {
-              allSubSequenceLength++;
-              currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength;
-            }
-          }
-          cpuArguments[slot].checkSubset();
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        // label slot
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                size,
-                                /* useGpu= */ false);
-        // fill labels
-        int* buf = cpuArguments[slot].ids->getData();
-        for (int i = 0; i < size; ++i) {
-          buf[i] = slots_[slot].indexData[dataPos[i]];
-        }
-        // label HAS sequence structure
-        cpuArguments[slot].sequenceStartPositions->fillSequence(false);
-        break;
-      }
-      case SlotDef::VECTOR_DENSE: {
-        // copy values
-        size_t dim = header_.slot_defs(slot).dim();
-        Matrix::resizeOrCreate(cpuArguments[slot].value,
-                               size,
-                               dim,
-                               false,   // trans = false
-                               false);  // useGpu = false
-        real* buf = cpuArguments[slot].value->getData();
-        for (int i = 0; i < size; ++i) {
-          memcpy(buf + i * dim,
-                 slots_[slot].denseData.data() + dataPos[i] * dim,
-                 sizeof(real) * dim);
-        }
-        // sequence structure
-        cpuArguments[slot].sequenceStartPositions->fillSequence(false);
-        break;
-      }
-      default: { LOG(FATAL) << "should not reach here"; }
-    }
-  }
-
-  if (useGpu_) {
-    std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-    DataBatch& gpuBatch = *gpuBatch_;
-    std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
-    gpuArguments.resize(cpuArguments.size());
-    gpuBatch.setSize(size);
-    for (size_t i = 0; i < cpuArguments.size(); ++i) {
-      gpuArguments[i].resizeAndCopyFrom(
-          cpuArguments[i], useGpu_, HPPL_STREAM_1);
-    }
-    hl_stream_synchronize(HPPL_STREAM_1);
-    *batch = gpuBatch;
-  } else {
-    *batch = cpuBatch;
-  }
-
-  currentSequenceIndex_ += numScannedSeqs;
-  return batch->getSize();
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h
deleted file mode 100644
index 7dd45e062248f20d24c633dd4e1c8b7eebcbfa1b..0000000000000000000000000000000000000000
--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-
-#include "DataFormat.pb.h"
-#include "paddle/utils/Stat.h"
-
-#include "DataProvider.h"
-#include "ProtoReader.h"
-
-namespace paddle {
-
-/**
- * @brief Provider data from protobuf data file with each sample
- * specified by proto message
- *
- * DataSample defined in DataFormat.proto.
- *
- * The file format is
- *
- *    header
- *
- *    sample1
- *
- *    sample2
- *
- *    ...
- *
- *    sampleN
- *
- * @note: In the data file, each message is prefixed with its length.
- * The read/write of the protbuf are implemented in ProtoReader.h
- */
-class ProtoDataProvider : public DataProvider {
-public:
-  ProtoDataProvider(const DataConfig& config,
-                    bool useGpu,
-                    bool loadDataAll = true);
-  virtual void reset();
-
-  /**
-   * @note this size includes the sequences which are skipped because they
-   * are longer than the batch size.
-   */
-  virtual int64_t getSize() {
-    int64_t size = sampleNums_;
-    if (usageRatio_ < 1.0f) {
-      size = static_cast<int64_t>(size * usageRatio_);
-    }
-    return size;
-  }
-  virtual void shuffle();
-
-  void loadData(const std::vector<std::string>& fileList);
-
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-
-protected:
-  /**
-   * @brief load protobuf data from a list of file
-   * @param[in]  fileName  file name of a file which contains
-   * a list of file names
-   */
-  void loadData(const std::string& fileName);
-
-  /**
-   * @brief load protobuf data from file
-   * @param[in]  fileName   data file name
-   */
-  void loadDataFile(const std::string& fileName);
-  /** @brief check data header of each data sample
-   *  @param[in] header     data header read from protobuf data
-   */
-  void checkDataHeader(const DataHeader& header);
-  /**
-   * @brief fill protobuf data into slot_,
-   * slot_ is a vector of ProtoSlot in memory.
-   * @param[in]  sample     data sample read from protobuf data
-   */
-  void fillSlots(const DataSample& sample);
-
-  /**
-   * @brief return true if each sample is one sequence, i.e., independent
-   * of other samples.
-   */
-  inline bool iidData() const { return sequenceStartPositions_.empty(); }
-
-  /**
-   * @brief check that sample is consistent with header_
-   */
-  void checkSample(const DataSample& sample);
-
-  template <class Op>
-  int64_t sequenceLoop(Op op, int64_t size);
-
-  template <class Op>
-  int64_t sampleLoop(Op op, int64_t size);
-
-  template <class Op>
-  int64_t subSampleLoop(Op op, int64_t size, int slot);
-
-  void showDataStats();
-
-protected:
-  struct ProtoVarSlot {
-    std::vector<real> data;
-    std::vector<int> dims;
-  };
-
-  struct ProtoSlot {
-    SlotDef::SlotType type;
-    int dim;
-    std::vector<int> indexData;
-    std::vector<real> denseData;
-    std::vector<sparse_non_value_t> sparseNonValueData;
-    std::vector<sparse_float_value_t> sparseFloatValueData;
-    std::vector<int64_t> indices;
-    std::vector<int64_t> subIndices;
-
-    std::vector<ProtoVarSlot> varDenseData;
-    std::vector<std::vector<int>> varIndices;
-    std::vector<std::string> strData;
-  };
-  DataHeader header_;
-  int numVecSlots_;
-
-  std::vector<ProtoSlot> slots_;
-  size_t sampleNums_;
-
-  /**
-   * The starting position of each sequence in samples.
-   * The last element should be num of samples.
-   * If empty, each sample is one sequence.
-   */
-  std::vector<size_t> sequenceStartPositions_;
-
-  int64_t currentSequenceIndex_;
-
-  // The size should be the number of sequences.
-  std::vector<size_t> shuffledSequenceIds_;
-
-  ThreadLocalD<DataBatch> cpuBatch_;
-  ThreadLocalD<DataBatch> gpuBatch_;
-
-  RWLock lock_;
-  std::vector<StatPtr> nnzStats_;  // stats for number of none-zeros entries
-};
-
-/**
- * @brief Special use for Proto data: instances should contain sparse-non-value
- * slots
- * and label.
- *
- * @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
- */
-class ProtoSequenceDataProvider : public ProtoDataProvider {
-public:
-  ProtoSequenceDataProvider(const DataConfig& config,
-                            bool useGpu,
-                            bool loadDataAll = true);
-  ~ProtoSequenceDataProvider() {}
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp
index bc7d1c83a48aefeb4bc6d3baa32b78aba712e58d..925af31289d0c8ca534a30a16b14bfd2df90b013 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -41,6 +41,7 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,
     useGlobalStats_ = config_.use_global_stats();
   }
   movingAvgFraction_ = config_.moving_average_fraction();
+  epsilon_ = config_.epsilon();
 
   weight_.reset(new Weight(1, channels_, parameters_[0]));
   movingMean_.reset(new Weight(1, channels_, parameters_[1]));
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h
index e721d2d267a31cae46407673b8b1281e87055608..2ac3cd9d670d0fcf9c40ad2f117d5a72479663a3 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -94,6 +94,8 @@ protected:
   bool useGlobalStats_;
   // use to compute moving mean and variance.
   real movingAvgFraction_;
+  // Epsilon is a small random noise used in batch normalization for stability.
+  real epsilon_;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp
index dacff25e5927daf9c991577a71be86b160228317..25ab5cd927792d18f78bc1fa33eee4029b427cc7 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -22,8 +22,6 @@ namespace paddle {
 
 REGISTER_LAYER(batch_norm, BatchNormalizationLayer);
 
-const real BatchNormalizationLayer::EPS = 1E-5;
-
 bool BatchNormalizationLayer::init(const LayerMap& layerMap,
                                    const ParameterMap& parameterMap) {
   /* Initialize the basic parent class */
@@ -53,7 +51,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
 
   calMovingMeanAndVar();
 
-  savedInvVar_->subScalar(-EPS);
+  savedInvVar_->subScalar(-epsilon_);
   savedInvVar_->sqrt2(*savedInvVar_);
 }
 
@@ -74,7 +72,7 @@ void BatchNormalizationLayer::setMeanAndStd() {
   savedInvVar_->copyFrom(*(movingVar_->getW()));
   savedInvVar_->downClip(real(0.0));
 
-  savedInvVar_->subScalar(-EPS);
+  savedInvVar_->subScalar(-epsilon_);
   savedInvVar_->sqrt2(*savedInvVar_);
 }
 
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h
index f6115801fc6b341c0718f8851617de43bdeeec09..1fdb5e2070259a14ab6f70957c9cf03f0699f734 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ b/paddle/gserver/layers/BatchNormalizationLayer.h
@@ -39,9 +39,6 @@ public:
   void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
-  /// Epsilon value used in the batch normalization formula.
-  static const real EPS;
-
   /// Load pre-calculated mean and std.
   void setMeanAndStd();
 
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
index 49a9540c0b6e36b59ed786287ff5c4569b69a6a5..8390b55026c895b661cb514714ba92c05a7bf02e 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
@@ -21,8 +21,6 @@ namespace paddle {
 
 REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer);
 
-const double CudnnBatchNormLayer::EPS = 1E-5;
-
 bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
                                const ParameterMap& parameterMap) {
   /* Initialize the basic parent class */
@@ -61,6 +59,9 @@ void CudnnBatchNormLayer::forward(PassType passType) {
   real* movingMean = movingMean_->getW()->getData();
   real* movingVar = movingVar_->getW()->getData();
 
+  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
+  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
+
   if (!useGlobalStats_) {
     REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str());
     real* savedMean = savedMean_->getData();
@@ -75,7 +76,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                    1.0 - movingAvgFraction_,
                                    movingMean,
                                    movingVar,
-                                   EPS,
+                                   eps_,
                                    savedMean,
                                    savedInvVar);
   } else {
@@ -90,7 +91,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                       beta,
                                       movingMean,
                                       movingVar,
-                                      EPS);
+                                      eps_);
     } else {
       // There is a limitation in cudnn library.
       // When the batch size is larger than 1024 in cuDNN v5.1,
@@ -101,7 +102,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                    beta,
                                    movingMean,
                                    movingVar,
-                                   EPS,
+                                   eps_,
                                    batchSize,
                                    channels_,
                                    imageH_ * imageD_,
@@ -128,6 +129,9 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
   real* savedMean = savedMean_->getData();
   real* savedInvVar = savedInvVar_->getData();
 
+  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
+  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
+
   auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) {
     Matrix::resizeOrCreate(m, h, w, false, true);
     m->zeroMem();
@@ -157,7 +161,7 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
                          gamma,
                          gammaGrad,
                          betaGrad,
-                         EPS,
+                         eps_,
                          savedMean,
                          savedInvVar);
 
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h
index 413efd4d3ecd734b343efbcf8328ac0592daddda..1a3f0c0cbf8a1540e77cef70c753c91298728484 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.h
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <cudnn.h>
 #include "BatchNormBaseLayer.h"
 #include "Layer.h"
 #include "paddle/utils/Stat.h"
@@ -46,12 +47,9 @@ public:
   void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
-  /**
-   * Epsilon value used in the batch normalization formula.
-   * Minimum allowed value is CUDNN_BN_MIN_EPSILON defined in cudnn.h.
-   * Same epsilon value should be used in forward and backward functions.
-   */
-  static const double EPS;
+  /// Epsilon value used in the batch normalization formula.
+  /// Same epsilon value should be used in forward and backward functions.
+  double eps_;
 
   /// Input/output tensor descriptor desc
   hl_tensor_descriptor ioDesc_;
diff --git a/paddle/gserver/layers/DotProdLayer.cpp b/paddle/gserver/layers/DotProdLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9e2dbe3c3c416f606d2938701f26288642b55267
--- /dev/null
+++ b/paddle/gserver/layers/DotProdLayer.cpp
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for computing the dot product of two vectors.
+ * Input1: vector (batchSize * dim)
+ * Input2: vector (batchSize * dim)
+ * Output: a matrix: (batchSize * 1)
+ */
+
+class DotProdLayer : public Layer {
+public:
+  explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~DotProdLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(dot_prod, DotProdLayer);
+
+bool DotProdLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2U);
+  CHECK_EQ(1UL, getSize())
+      << "The output dimensionality of this layer should be fixed to 1.";
+
+  return true;
+}
+
+void DotProdLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+
+  size_t batchSize = inV0->getHeight();
+  CHECK_EQ(inV1->getHeight(), batchSize);
+  CHECK_EQ(inV0->getWidth(), inV1->getWidth());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, 1);
+  }
+
+  MatrixPtr outV = getOutputValue();
+  {
+    REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
+    outV->sumOfProducts(*inV0, *inV1, 1, 0);
+  }
+}
+
+void DotProdLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+
+  {
+    REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
+
+    if (inG0) {
+      inG0->addRowScale(0, *inV1, *outG);
+    }
+
+    if (inG1) {
+      inG1->addRowScale(0, *inV0, *outG);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..be26b9ba88c279036f73b0a0baaff164755fe067
--- /dev/null
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -0,0 +1,158 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "FactorizationMachineLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
+
+bool FactorizationMachineLayer::init(const LayerMap& layerMap,
+                                     const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  factorSize_ = config_.factor_size();
+
+  /* initialize the latentVectors_ */
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  size_t inputSize = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
+  latentVectors_ = std::unique_ptr<Weight>(
+      new Weight(inputSize, factorSize_, parameters_[0]));
+
+  return true;
+}
+
+void FactorizationMachineLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const MatrixPtr& inputV = getInputValue(0);
+
+  size_t batchSize = inputV->getHeight();
+  size_t outputSize = getSize();
+  size_t inputSize = inputLayers_[0]->getSize();
+  reserveOutput(batchSize, outputSize);
+
+  MatrixPtr outV = getOutputValue();
+
+  Matrix::resizeOrCreate(
+      latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      inputMulFactor_, batchSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
+
+  REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str());
+  inputMulFactor_->mul(*inputV, *latentVectors_->getW());
+  inputMulFactor_->square2(*tmpOut_);
+  outV->sumRows(*tmpOut_, 0.5, 0);
+
+  if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+    Matrix::resizeOrCreateSparseMatrix(inputSquare_,
+                                       inputV->getHeight(),
+                                       inputV->getWidth(),
+                                       inputV->getElementCnt(),
+                                       inputV->getValueType());
+    inputSquare_->copyFrom(*inputV);
+    (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
+  } else {
+    Matrix::resizeOrCreate(
+        inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+    inputV->square2(*inputSquare_);
+  }
+  latentVectors_->getW()->square2(*latentVectorsSquare_);
+  tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
+  outV->sumRows(*tmpOut_, -0.5, 1.0);
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ { backwardActivation(); }
+
+  const MatrixPtr& inputV = getInputValue(0);
+  const MatrixPtr& oGrad = getOutputGrad();
+
+  Matrix::resizeOrCreate(
+      tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
+  MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
+                                         latentVectors_->getW()->getHeight(),
+                                         1,
+                                         false,
+                                         useGpu_);
+
+  /* Calculate the gradients of the latentVectors_ matrix */
+  if (latentVectors_->getWGrad()) {
+    if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+      Matrix::resizeOrCreateSparseMatrix(tmpInput_,
+                                         inputV->getHeight(),
+                                         inputV->getWidth(),
+                                         inputV->getElementCnt());
+
+      CpuSparseMatrix* sparseInputV =
+          dynamic_cast<CpuSparseMatrix*>(inputV.get());
+      CpuSparseMatrix* sparseInputSquare =
+          dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
+      CpuSparseMatrix* sparseTmpInput =
+          dynamic_cast<CpuSparseMatrix*>(tmpInput_.get());
+      sparseTmpInput->copyFrom(*sparseInputV);
+
+      sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
+      sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
+
+      Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
+      negOnes_->zeroMem();
+      negOnes_->add(-1);
+      tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
+    } else {
+      Matrix::resizeOrCreate(
+          tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+
+      tmpInput_->rowScale(0, *inputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1);
+      tmpInput_->rowScale(0, *inputSquare_, *oGrad);
+
+      tmpSum_->sumCols(*tmpInput_, -1, 0);
+    }
+
+    latentVectors_->getWGrad()->addRowScale(
+        0, *latentVectors_->getW(), *tmpSumTrans);
+
+    /* Increasing the number of gradient */
+    latentVectors_->getParameterPtr()->incUpdate(callback);
+  }
+
+  /* Calculate the input layers gradient */
+  MatrixPtr inGrad = getInputGrad(0);
+  if (inGrad != NULL) {
+    inGrad->mul(
+        *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
+    tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
+    inGrad->addColScale(0, *inputV, *tmpSum_);
+    inGrad->rowScale(0, *inGrad, *oGrad);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..df20a49934d5dd444f127842c8fdb7c77f4ebeb1
--- /dev/null
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+/**
+ * @brief The Factorization Machine models pairwise (order-2) feature
+ * interactions as inner product of the learned latent vectors corresponding
+ * to each input feature.
+ *
+ * The Factorization Machine can effectively capture feature interactions
+ * especially when the input is sparse. While in principle FM can model higher
+ * order feature interaction, in practice usually only order-2 feature
+ * interactions are considered. The Factorization Machine Layer here only
+ * computes the order-2 interations with the formula:
+ *
+ * \f[
+ *     y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+ * \f]
+ *
+ * The detailed calculation for forward and backward can be found at this paper:
+ *
+ *     Factorization machines.
+ *
+ * The config file api is factorization_machine.
+ */
+
+class FactorizationMachineLayer : public Layer {
+protected:
+  // The latent vectors, shape: (size, factorSize_)
+  // Each row of the latentVectors_ matrix is the latent vector
+  // corresponding to one input feature dimension
+  std::unique_ptr<Weight> latentVectors_;
+  // The hyperparameter that defines the dimensionality of the factorization
+  size_t factorSize_;
+
+private:
+  // Store the square values of the letent vectors matrix
+  MatrixPtr latentVectorsSquare_;
+  // Store the square values of input matrix
+  MatrixPtr inputSquare_;
+  // The result of input matrix * latent vector matrix that will be used in
+  // both forward and backward step
+  MatrixPtr inputMulFactor_;
+  // Store temporary calculation result
+  MatrixPtr tmpOut_;
+  MatrixPtr tmpSum_;
+  MatrixPtr tmpInput_;
+  // Negative identity matrix
+  MatrixPtr negOnes_;
+
+public:
+  explicit FactorizationMachineLayer(const LayerConfig& config)
+      : Layer(config) {}
+  ~FactorizationMachineLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
index d62a8d846e5b347aa44ce1951c043d5813a5b3ff..236f8096bdb6e024cf3c9c73eba422616a777a23 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
@@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
                          batchSize,
                          codeLength_,
                          /* trans */ false,
-                         useGpu(deviceId_));
+                         false);
   Matrix::resizeOrCreate(preOutput_.grad,
                          batchSize,
                          codeLength_,
                          /* trans */ false,
-                         useGpu(deviceId_));
-
+                         false);
   IVectorPtr label = getInput(*getLabelLayer()).ids;
-
   preOutput_.value->zeroMem();
 
+  if (useGpu_) {
+    Matrix::resizeOrCreate(cpuOutput_,
+                           output_.value->getHeight(),
+                           output_.value->getWidth(),
+                           /* trans */ false,
+                           false);
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+    cpuOutput_->copyFrom(*output_.value);
+  } else {
+    cpuOutput_ = output_.value;
+    cpuLabel_ = label;
+  }
   /* add the bias-vector */
   if (biases_.get() != NULL) {
-    preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW());
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_->getW());
+    } else {
+      cpuBias_ = biases_->getW();
+    }
+    preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
   }
   for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
     MatrixPtr input = getInputValue(i);
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuInput_,
+                             input->getHeight(),
+                             input->getWidth(),
+                             /* trans */ false,
+                             false);
+      Matrix::resizeOrCreate(cpuWeight_,
+                             weights_[i]->getW()->getHeight(),
+                             weights_[i]->getW()->getWidth(),
+                             /* trans */ false,
+                             false);
+      cpuInput_->copyFrom(*input);
+      cpuWeight_->copyFrom(*weights_[i]->getW());
+    } else {
+      cpuInput_ = input;
+      cpuWeight_ = weights_[i]->getW();
+    }
     preOutput_.value->mulByBitCode(
-        numClasses_, *label, *weights_[i]->getW(), *input);
+        numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
   }
   // keep consistent with the clipping in the following softrelu
   preOutput_.value->clip(-40.0, 40.0);
   preOutput_.value->sumByBitCode(numClasses_,
-                                 *label,
-                                 *output_.value,
+                                 *cpuLabel_,
+                                 *cpuOutput_,
                                  -1);  // scaleSum
   preOutput_.value->softrelu(*preOutput_.value);
-  MatrixPtr sum =
-      Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_));
+  MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
   preOutput_.value->rowSum(*sum);
-  output_.value->add(*sum);
+  cpuOutput_->add(*sum);
+  if (useGpu_) {
+    output_.value->copyFrom(*cpuOutput_);
+  } else {
+    output_.value = cpuOutput_;
+  }
 }
 
 void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
   IVectorPtr label = getInput(*getLabelLayer()).ids;
+  if (useGpu_) {
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+  } else {
+    cpuLabel_ = label;
+  }
   preOutput_.grad->one();
   preOutput_.grad->softreluDerivative(*preOutput_.value);
-  preOutput_.grad->subByBitCode(numClasses_, *label);
+  preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
 
   if (biases_ && biases_->getWGrad()) {
-    preOutput_.grad->addByBitCodeBackward(
-        numClasses_, *label, *biases_->getWGrad());
-
+    MatrixPtr biases_grad = biases_->getWGrad();
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_grad);
+    } else {
+      cpuBias_ = biases_grad;
+    }
+    preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
+    if (useGpu_) {
+      biases_grad->copyFrom(*cpuBias_);
+    } else {
+      biases_grad = cpuBias_;
+    }
     /* Increasing the number of gradient */
     biases_->getParameterPtr()->incUpdate(callback);
   }
@@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
     /* Calculate the W-gradient for the current layer */
     MatrixPtr input = getInputValue(i);
     if (weights_[i]->getWGrad()) {
+      MatrixPtr weights_grad = weights_[i]->getWGrad();
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInput_,
+                               input->getHeight(),
+                               input->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeightGrad_,
+                               weights_grad->getHeight(),
+                               weights_grad->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInput_->copyFrom(*input);
+        cpuWeightGrad_->copyFrom(*weights_grad);
+      } else {
+        cpuInput_ = input;
+        cpuWeightGrad_ = weights_grad;
+      }
       preOutput_.grad->mulByBitCodeBackwardWeight(
-          numClasses_, *label, *weights_[i]->getWGrad(), *input);
-
+          numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
+      if (useGpu_) {
+        weights_grad->copyFrom(*cpuWeightGrad_);
+      } else {
+        weights_grad = cpuWeightGrad_;
+      }
       /* Increasing the number of gradient */
       weights_[i]->getParameterPtr()->incUpdate(callback);
     }
@@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
     /* Calculate the input layers error */
     MatrixPtr inputGrad = getInputGrad(i);
     if (inputGrad) {
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInputGrad_,
+                               inputGrad->getHeight(),
+                               inputGrad->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeight_,
+                               weights_[i]->getW()->getHeight(),
+                               weights_[i]->getW()->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInputGrad_->copyFrom(*inputGrad);
+        cpuWeight_->copyFrom(*weights_[i]->getW());
+      } else {
+        cpuInputGrad_ = inputGrad;
+        cpuWeight_ = weights_[i]->getW();
+      }
       preOutput_.grad->mulByBitCodeBackwardError(
-          numClasses_, *label, *weights_[i]->getW(), *inputGrad);
+          numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
+      if (useGpu_) {
+        inputGrad->copyFrom(*cpuInputGrad_);
+      } else {
+        inputGrad = cpuInputGrad_;
+      }
     }
   }
 }
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
index 9afd40b1674680da962d6e51caa56b46279b70de..7f896e61ca26e3e22b99b65b1285384a121f7f02 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -80,6 +80,15 @@ protected:
   int codeLength_;
   /// temporary result of output_
   Argument preOutput_;
+
+  /// The temporary variables in CPU memory.
+  MatrixPtr cpuWeight_;
+  MatrixPtr cpuWeightGrad_;
+  MatrixPtr cpuInput_;
+  MatrixPtr cpuInputGrad_;
+  MatrixPtr cpuBias_;
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/L2DistanceLayer.cpp b/paddle/gserver/layers/L2DistanceLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c71df1b92cef9b19001a0984953a260fbdd1d762
--- /dev/null
+++ b/paddle/gserver/layers/L2DistanceLayer.cpp
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "L2DistanceLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(l2_distance, L2DistanceLayer);
+
+bool L2DistanceLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2UL) << "The L2DistanceLayer accepts two and "
+                                     << "only two inputs.";
+  CHECK_EQ(getSize(), 1UL) << "The output dimensionality of L2DistanceLayer "
+                           << "is fixed to be 1.";
+
+  return true;
+}
+
+void L2DistanceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const auto inV1 = getInputValue(0);
+  const auto inV2 = getInputValue(1);
+
+  CHECK(inV1 && inV2);
+  CHECK_EQ(inV1->getHeight(), inV2->getHeight())
+      << "The height of two inputs of this layer must be the same.";
+  CHECK_EQ(inV1->getWidth(), inV2->getWidth())
+      << "The width of two inputs of this layer must be the same.";
+
+  int batchSize = inV1->getHeight();
+  int output_dim = getSize();
+  {
+    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
+    reserveOutput(batchSize, output_dim);
+    auto outV = getOutputValue();
+    CHECK(outV) << "The output matrix should not be null.";
+
+    Matrix::resizeOrCreate(
+        inputSub_, inV1->getHeight(), inV1->getWidth(), false, useGpu_);
+
+    inputSub_->assign(*inV1);
+    inputSub_->sub(*inV2);
+    outV->sumOfProducts(*inputSub_, *inputSub_, 1, 0);
+    outV->sqrt2(*outV);
+  }
+}
+
+void L2DistanceLayer::backward(const UpdateCallback& callback) {
+  const auto outG = getOutputGrad();
+  const auto outV = getOutputValue();
+  CHECK(outG && outV);
+
+  auto inGrad1 = getInputGrad(0);
+  auto inGrad2 = getInputGrad(1);
+
+  {
+    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
+
+    if (inGrad1 || inGrad2) {
+      outV->scalarDiv(*outV, 1.);
+      outV->dotMul(*outG, *outV);
+    }
+
+    if (inGrad1) inGrad1->addRowScale(0, *inputSub_, *outV);
+
+    if (inGrad2) {
+      inputSub_->mulScalar(-1.);
+      inGrad2->addRowScale(0, *inputSub_, *outV);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/L2DistanceLayer.h b/paddle/gserver/layers/L2DistanceLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b12847a10e64a713635c0df079507b23a73c257
--- /dev/null
+++ b/paddle/gserver/layers/L2DistanceLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief The layer calculates the l2 distance between two input vectors.
+ * \f[
+ * f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D(x_i - y_i)}
+ * \f]
+ *
+ * - Input1: A vector (batchSize * dataDim)
+ * - Input2: A vector (batchSize * dataDim)
+ * - Output: A vector (batchSize * 1)
+ *
+ * The configuration api is: l2_distance_layer.
+ */
+
+class L2DistanceLayer : public Layer {
+public:
+  explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {}
+  ~L2DistanceLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+private:
+  // Store the result of subtracting Input2 from Input1 in forward computation,
+  // which will be reused in backward computation.
+  MatrixPtr inputSub_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
index 0f2b67fd758ec1513f42c4cb1a36f2f3915f4740..39bffc26f7ddcd159130c492115b41080e32ce7f 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@@ -38,12 +38,13 @@ bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
 }
 
 void MKLDNNAddtoLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
   CHECK_EQ(layerSize_, getSize()) << "this layer size can not be changed";
   reshapeInput(bs, ih, iw);
   ic = inputLayers_[0]->getSize() / ih / iw;
   CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
-  CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
+  CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
+           (size_t)bs * ic * ih * iw);
   for (size_t i = 0; i < inputLayers_.size(); i++) {
     CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
     CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
@@ -57,47 +58,43 @@ void MKLDNNAddtoLayer::reshape(
 }
 
 void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
-                                MKLDNNMatrixPtr& in,
-                                MKLDNNMatrixPtr& wgt,
-                                MKLDNNMatrixPtr& bias,
+                                std::vector<MKLDNNMatrixPtr>& inputs,
                                 MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inVals_, bias, out);
-  in = inVals_[0];
+  resetFwdBuffers(inputs, biasVal_, out);
 
   std::shared_ptr<sum::primitive_desc> fwdPD;
   std::shared_ptr<sum::primitive_desc> biasPD;
-  resetFwdPD(fwdPD, biasPD, inVals_, bias, out);
+  resetFwdPD(fwdPD, biasPD, inputs, biasVal_, out);
 
-  resetFwdPipeline(pipeline, fwdPD, biasPD, inVals_, bias, out);
+  resetFwdPipeline(pipeline, fwdPD, biasPD, inputs, biasVal_, out);
 }
 
 void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
-                                MKLDNNMatrixPtr& in,
-                                MKLDNNMatrixPtr& wgt,
-                                MKLDNNMatrixPtr& bias,
+                                std::vector<MKLDNNMatrixPtr>& inputs,
                                 MKLDNNMatrixPtr& out) {
-  resetBwdBuffers(inGrads_, bias, out);
-  in = inGrads_[0];
+  resetBwdBuffers(inputs, biasGrad_, out);
 
   // backward only need share output grad to input grad
-  for (size_t i = 0; i < inGrads_.size(); i++) {
-    if (inGrads_[i] != nullptr) {
-      inGrads_[i] = out;
-      inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    if (inputs[i] != nullptr) {
+      inputs[i] = out;
+      inputLayers_[i]->getOutputGrad()->setData(inputs[i]->getData());
     }
   }
 
   // backward bias
   bwdBias_ = nullptr;
-  if (bias) {
+  if (biasGrad_) {
     std::vector<float> scales(bs_, 1.0);
-    std::vector<memory::primitive_desc> srcPDs(bs_, bias->getPrimitiveDesc());
-    auto biasPD = sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs);
+    std::vector<memory::primitive_desc> srcPDs(bs_,
+                                               biasGrad_->getPrimitiveDesc());
+    auto biasPD =
+        sum::primitive_desc(biasGrad_->getMemoryDesc(), scales, srcPDs);
     std::vector<primitive::at> srcs;
     for (size_t i = 0; i < grads_.size(); ++i) {
       srcs.push_back(*(grads_[i]));
     }
-    bwdBias_.reset(new sum(biasPD, srcs, *bias));
+    bwdBias_.reset(new sum(biasPD, srcs, *biasGrad_));
     pipeline.push_back(*bwdBias_);
   }
 }
@@ -208,7 +205,7 @@ void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
 
   inputs.resize(inputLayers_.size());
   for (size_t i = 0; i < inputs.size(); i++) {
-    resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
+    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
     CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
   }
 
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.h b/paddle/gserver/layers/MKLDNNAddtoLayer.h
index 24504b7b4f50726e2b2757ca3029461cdc27b411..0ea3e208e5fab8cbed8b53390a9381e6f2bb5733 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.h
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@@ -26,9 +26,6 @@ namespace paddle {
  */
 class MKLDNNAddtoLayer : public MKLDNNLayer {
 protected:
-  std::vector<MKLDNNMatrixPtr> inVals_;
-  std::vector<MKLDNNMatrixPtr> inGrads_;
-
   // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
   size_t layerSize_;
 
@@ -50,52 +47,19 @@ public:
             const ParameterMap& parameterMap) override;
 
   void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
 
   void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void updateWeights(const UpdateCallback& callback) override;
 
-  void printValueFormat() override {
-    for (size_t i = 0; i < inVals_.size(); ++i) {
-      VLOG(MKLDNN_FMTS) << i << " input: " << inVals_[i]->getFormat() << " >>>";
-    }
-    if (outVal_) {
-      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
-    }
-    if (extOutVal_) {
-      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
-    }
-  }
-
-  void printGradFormat() override {
-    if (extOutGrad_) {
-      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
-    }
-    if (outGrad_) {
-      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
-    }
-    for (size_t i = 0; i < inGrads_.size(); ++i) {
-      VLOG(MKLDNN_FMTS) << i << " input: " << inGrads_[i]->getFormat() << "<<<";
-    }
-  }
-
 protected:
-  /**
-   * Forward functions: reset buffers(inputs, output, bias),
-   *                    reset primitive descriptor,
-   *                    reset pipeline.
-   */
   void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
@@ -110,17 +74,10 @@ protected:
                         std::vector<MKLDNNMatrixPtr>& inputs,
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out);
-
-  /**
-   * Backward functions: reset buffers(inputs, output, bias)
-   */
   void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
 
-  /**
-   * prepare for bias
-   */
   void prepareBias(MKLDNNMatrixPtr& bias,
                    const MatrixPtr& biasMat,
                    const MKLDNNMatrixPtr& out,
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
index 071bdf54d5dc9538d5ced580a73b9c0fbcea41fb..7faca0f8b7f54fa0a09e8fdab11064c8c26df375 100644
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@@ -21,8 +21,6 @@ namespace paddle {
 
 REGISTER_LAYER(mkldnn_batch_norm, MKLDNNBatchNormLayer);
 
-const real MKLDNNBatchNormLayer::EPS = 1E-5;
-
 bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
                                 const ParameterMap& parameterMap) {
   if (!MKLDNNLayer::init(layerMap, parameterMap)) {
@@ -50,6 +48,8 @@ bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
     useGlobalStats_ = config_.use_global_stats();
   }
   movingAvgFraction_ = config_.moving_average_fraction();
+  epsilon_ = config_.epsilon();
+
   VLOG(MKLDNN_BASE) << "--- " << (useGlobalStats_ ? "use" : "do not use")
                     << " --- global stats";
   VLOG(MKLDNN_BASE) << "Moving average fraction: " << movingAvgFraction_;
@@ -116,21 +116,20 @@ void MKLDNNBatchNormLayer::calMovingMeanAndVar() {
 }
 
 void MKLDNNBatchNormLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
   reshapeInput(bs, ih, iw);
   oh = ih;
   ow = iw;
   // ic_ and oc can not be changed
-  CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
       << "Input channel can not be changed";
   reshapeOutput(oh, ow);
   resizeOutput(bs, oc * oh * ow);
 }
 
 void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
-                                    MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
+                                    std::vector<MKLDNNMatrixPtr>& inputs,
                                     MKLDNNMatrixPtr& out) {
   // In training phase, it will always calculate mean and var,
   // so useGlobalStats must be false.
@@ -140,25 +139,23 @@ void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
     useGlobalStats_ = false;
   }
 
-  resetFwdBuffers(in, wgt, out);
+  resetFwdBuffers(inputs[0], wgtVal_, out);
 
-  resetFwdPD(fwdPD_, in, wgt, out);
+  resetFwdPD(fwdPD_, inputs[0], wgtVal_, out);
 
-  resetFwdPipeline(pipeline, fwdPD_, in, wgt, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, out);
 }
 
 void MKLDNNBatchNormLayer::resetBwd(std::vector<primitive>& pipeline,
-                                    MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
+                                    std::vector<MKLDNNMatrixPtr>& inputs,
                                     MKLDNNMatrixPtr& out) {
   std::shared_ptr<bn_bwd::primitive_desc> pd;
 
-  resetBwdBuffers(in, wgt, out);
+  resetBwdBuffers(inputs[0], wgtGrad_, out);
 
-  resetBwdPD(pd, in, wgt, out);
+  resetBwdPD(pd, inputs[0], wgtGrad_, out);
 
-  resetBwdPipeline(pipeline, pd, in, wgt, out);
+  resetBwdPipeline(pipeline, pd, inputs[0], wgtGrad_, out);
 }
 
 void MKLDNNBatchNormLayer::forward(PassType passType) {
@@ -213,7 +210,7 @@ void MKLDNNBatchNormLayer::resetFwdPD(
   if (wgt) {
     flags_ = (flags_ | batch_normalization_flag::use_scale_shift);
   }
-  auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), EPS, flags_);
+  auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), epsilon_, flags_);
   pd.reset(new bn_fwd::primitive_desc(fwdDesc, engine_));
   CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
   if (wgt) {
@@ -260,9 +257,9 @@ void MKLDNNBatchNormLayer::resetFwdPipeline(
 void MKLDNNBatchNormLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                            MKLDNNMatrixPtr& wgt,
                                            MKLDNNMatrixPtr& out) {
-  CHECK(inVal_ && outVal_);
+  CHECK(inVals_[0] && outVal_);
   resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
   if (gradScaleShift_) {
     CHECK(wgtVal_);
     resetWithMatrix(wgt, gradScaleShift_, wgtVal_->getPrimitiveDesc());
@@ -280,7 +277,7 @@ void MKLDNNBatchNormLayer::resetBwdPD(
   }
   CHECK_PRIMITIVE_DESC_EQ(out, in->getPrimitiveDesc());
   auto md = in->getMemoryDesc();
-  auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, EPS, flags_);
+  auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, epsilon_, flags_);
   pd.reset(new bn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
   CHECK(pd->weights_primitive_desc() == fwdPD_->weights_primitive_desc());
   CHECK_PRIMITIVE_DESC_EQ(wgt, pd->diff_weights_primitive_desc());
@@ -297,11 +294,12 @@ void MKLDNNBatchNormLayer::resetBwdPipeline(
   if (pd == nullptr) {
     return;
   }
-  CHECK(inVal_);
+  CHECK(inVals_[0]);
   bwdData_.reset(
       wgt && wgtVal_
-          ? new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *wgtVal_, *in, *wgt)
-          : new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *in));
+          ? new bn_bwd(
+                *pd, *inVals_[0], *mean_, *var_, *out, *wgtVal_, *in, *wgt)
+          : new bn_bwd(*pd, *inVals_[0], *mean_, *var_, *out, *in));
   pipeline.push_back(*bwdData_);
 }
 
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.h b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
index 456c0424ecb8dde17f98a900c5d77268cc672e34..1cf33cb34fa9cd7c9b8487a0a4a0011fb129e311 100644
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.h
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
@@ -32,7 +32,8 @@ protected:
   std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;
 
   // Epsilon value used in the batch normalization formula.
-  static const real EPS;
+  real epsilon_;
+
   // weight and bias in paddle
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> biases_;
@@ -73,18 +74,14 @@ public:
   void forward(PassType passType) override;
 
   void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
 
   void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void updateWeights(const UpdateCallback& callback) override;
@@ -98,11 +95,7 @@ protected:
    * moving = moving * AvgFraction + local * (1 - AvgFraction)
    */
   void calMovingMeanAndVar();
-  /**
-   * Forward functions: reset buffers(input, weight, output),
-   *                    reset primitive descriptor,
-   *                    reset pipeline.
-   */
+
   void resetFwdBuffers(MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& out);
@@ -115,12 +108,6 @@ protected:
                         MKLDNNMatrixPtr& in,
                         MKLDNNMatrixPtr& wgt,
                         MKLDNNMatrixPtr& out);
-
-  /**
-   * Backward functions: reset buffers(input, weight, output),
-   *                     reset primitive descriptor,
-   *                     reset pipeline.
-   */
   void resetBwdBuffers(MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& out);
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.cpp b/paddle/gserver/layers/MKLDNNConcatLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..44bb0883b89c712d70e2d4fdfe16bdfde86f81b7
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.cpp
@@ -0,0 +1,185 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNConcatLayer.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_concat, MKLDNNConcatLayer);
+
+bool MKLDNNConcatLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  CHECK_GT(inputLayers_.size(), 1UL);
+  CHECK(!biasParameter_);
+  return true;
+}
+
+void MKLDNNConcatLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+  ic = inputLayers_[0]->getSize() / ih / iw;
+  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
+  CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
+           (size_t)bs * ic * ih * iw);
+  CHECK_GT(inputLayers_.size(), 1UL);
+  channels_.resize(inputLayers_.size());
+  channels_[0] = ic;
+  oc = ic;
+  for (size_t i = 1; i < inputLayers_.size(); i++) {
+    int batchsize, height, witdh;
+    reshapeInput(batchsize, height, witdh, i);
+    CHECK_EQ(bs, batchsize);
+    CHECK_EQ(ih, height);
+    CHECK_EQ(iw, witdh);
+
+    channels_[i] = inputLayers_[i]->getSize() / height / witdh;
+    CHECK_EQ((size_t)channels_[i] * height * witdh, inputLayers_[i]->getSize());
+    oc += channels_[i];
+  }
+  oh = ih;
+  ow = iw;
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+}
+
+void MKLDNNConcatLayer::resetFwd(std::vector<primitive>& pipeline,
+                                 std::vector<MKLDNNMatrixPtr>& inputs,
+                                 MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(inputs, out);
+
+  std::shared_ptr<concat::primitive_desc> fwdPD;
+  resetFwdPD(fwdPD, inputs, out);
+
+  resetFwdPipeline(pipeline, fwdPD, inputs, out);
+}
+
+void MKLDNNConcatLayer::resetBwd(std::vector<primitive>& pipeline,
+                                 std::vector<MKLDNNMatrixPtr>& inputs,
+                                 MKLDNNMatrixPtr& out) {
+  resetBwdBuffers(inputs, out);
+
+  resetBwdPipeline(pipeline, bwds_, inputs, out);
+}
+
+void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                        MKLDNNMatrixPtr& out) {
+  inputs.resize(inputLayers_.size());
+  bool has8c = false, has16c = false, hasnc = false;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    resetInValue(inputs[i], nullptr, i, channels_[i]);
+    CHECK(inputs[i]);
+    auto dm = inputs[i]->getDims();
+    // inputs format can be different, but ndims must equal
+    CHECK(i == 0 || dm.size() == inputs[0]->getDims().size());
+    CHECK_EQ(bs_, dm[0]);
+    CHECK_EQ(channels_[i], dm[1]);
+    if (dm.size() > 2) {
+      CHECK_EQ(ih_, dm[2]);
+      CHECK_EQ(iw_, dm[3]);
+    }
+    if (inputs[i]->getFormat() == format::nc) {
+      hasnc = true;
+    }
+    if (inputs[i]->getFormat() == format::nChw8c) {
+      has8c = true;
+    }
+    if (inputs[i]->getFormat() == format::nChw16c) {
+      has16c = true;
+    }
+  }
+
+  format outFmt;
+  if (has16c && oc_ % 16 == 0) {
+    outFmt = format::nChw16c;
+  } else if (has8c && oc_ % 8 == 0) {
+    outFmt = format::nChw8c;
+  } else if (hasnc) {
+    CHECK(oh_ == 1 && ow_ == 1);
+    outFmt = format::nc;
+  } else {
+    outFmt = format::nchw;
+  }
+  memory::dims outDims =
+      hasnc ? memory::dims{bs_, oc_} : memory::dims{bs_, oc_, oh_, ow_};
+  auto outPD = MKLDNNMatrix::createPrimitiveDesc(outDims, outFmt, engine_);
+  resetOutValue(out, outPD);
+}
+
+void MKLDNNConcatLayer::resetFwdPD(std::shared_ptr<concat::primitive_desc>& pd,
+                                   std::vector<MKLDNNMatrixPtr>& inputs,
+                                   MKLDNNMatrixPtr out) {
+  std::vector<memory::primitive_desc> srcPDs;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    srcPDs.push_back(inputs[i]->getPrimitiveDesc());
+  }
+  CHECK(out);
+  pd.reset(new concat::primitive_desc(out->getMemoryDesc(), axis_, srcPDs));
+  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+}
+
+void MKLDNNConcatLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<concat::primitive_desc>& pd,
+    std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& out) {
+  std::vector<primitive::at> srcs;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    srcs.push_back(*(inputs[i]));
+  }
+  fwd_.reset(new concat(*pd, srcs, *out));
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNConcatLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                        MKLDNNMatrixPtr& out) {
+  CHECK(outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  CHECK(out);
+
+  inputs.resize(inputLayers_.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    CHECK(inVals_[i]);
+    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
+    CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc());
+  }
+}
+
+void MKLDNNConcatLayer::resetBwdPipeline(
+    std::vector<mkldnn::primitive>& pipeline,
+    std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
+    std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& out) {
+  // reset the backward primitives
+  memory::dims offsets = {0, 0, 0, 0};
+  prims.resize(inputs.size());
+  CHECK_EQ(inputs.size(), channels_.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    auto viewPD = view::primitive_desc(
+        out->getPrimitiveDesc(), inputs[i]->getDims(), offsets);
+    auto bwdPD = reorder::primitive_desc(viewPD.dst_primitive_desc(),
+                                         inputs[i]->getPrimitiveDesc());
+    prims[i].reset(new reorder(bwdPD, *out, *(inputs[i])));
+    offsets[axis_] += channels_[i];
+    // push to pipeline
+    pipeline.push_back(*prims[i]);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.h b/paddle/gserver/layers/MKLDNNConcatLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..37f3a26c5ed5db10cdba507368874c9557fb75ef
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of MKLDNNLayer Concatenate layer.
+ *
+ * The config file api is mkldnn_concat
+ */
+class MKLDNNConcatLayer : public MKLDNNLayer {
+protected:
+  std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
+  // input channel numbers
+  std::vector<int> channels_;
+
+  // concat_dimension in MKLDNN
+  // if axis_ == 0, concat batchsize
+  // if axis_ == 1, concat channel (default)
+  int axis_;
+
+public:
+  explicit MKLDNNConcatLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), axis_(1) {}
+
+  ~MKLDNNConcatLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void printSizeInfo() override {
+    CHECK_EQ(channels_.size(), inputLayers_.size());
+    for (size_t i = 0; i < channels_.size(); ++i) {
+      VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName()
+                         << ": " << bs_ << ", " << channels_[i] << ", " << ih_
+                         << ", " << iw_;
+    }
+    VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_
+                       << ", " << ow_;
+  }
+
+  size_t keepCondition() {
+    // reset when the total element size of all inputs changed
+    size_t totalSize = inputLayers_[0]->getOutputValue()->getElementCnt();
+    for (size_t i = 1; i < inputLayers_.size(); ++i) {
+      totalSize += inputLayers_[i]->getOutputValue()->getElementCnt();
+    }
+    return totalSize;
+  }
+
+protected:
+  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
+                  std::vector<MKLDNNMatrixPtr>& inputs,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index 8aa54e0a9efa7adb766cbb6009f6a29410c6ae7d..ab1d0f7b049a349c00c6e23deb37d789382de64f 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -90,7 +90,7 @@ void MKLDNNConvLayer::convertWeightsToPaddle() {
 }
 
 void MKLDNNConvLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
   reshapeInput(bs, ih, iw);
 
   // cal output sizes
@@ -105,21 +105,17 @@ void MKLDNNConvLayer::reshape(
 }
 
 void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
-                               MKLDNNMatrixPtr& in,
-                               MKLDNNMatrixPtr& wgt,
-                               MKLDNNMatrixPtr& bias,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
                                MKLDNNMatrixPtr& out) {
   resetFwdPD(fwdPD_);
 
-  resetFwdBuffers(fwdPD_, in, wgt, bias, out);
+  resetFwdBuffers(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
 
-  resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
 }
 
 void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
-                               MKLDNNMatrixPtr& in,
-                               MKLDNNMatrixPtr& wgt,
-                               MKLDNNMatrixPtr& bias,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
                                MKLDNNMatrixPtr& out) {
   std::shared_ptr<conv_bwdWgt::primitive_desc> bwdWgtPD;
   std::shared_ptr<conv_bwdData::primitive_desc> bwdDataPD;
@@ -128,9 +124,10 @@ void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
 
   resetBwdDataPD(bwdDataPD);
 
-  resetBwdBuffers(bwdWgtPD, bwdDataPD, in, wgt, bias, out);
+  resetBwdBuffers(bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
 
-  resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
+  resetBwdPipeline(
+      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
 }
 
 void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
@@ -236,14 +233,14 @@ void MKLDNNConvLayer::resetBwdWgtPD(
   loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
 
   // create backward weight using input, output and weight value memory desc
-  CHECK(inVal_) << "Should have internal input value";
+  CHECK(inVals_[0]) << "Should have internal input value";
   CHECK(outVal_) << "Should have internal output value";
   CHECK(wgtVal_) << "Should have weight value";
   algorithm algo = algorithm::convolution_direct;
   padding_kind padKind = padding_kind::zero;
   auto bwdWgtDesc = biasVal_ != nullptr
                         ? conv_bwdWgt::desc(algo,
-                                            inVal_->getMemoryDesc(),
+                                            inVals_[0]->getMemoryDesc(),
                                             wgtVal_->getMemoryDesc(),
                                             biasVal_->getMemoryDesc(),
                                             outVal_->getMemoryDesc(),
@@ -252,7 +249,7 @@ void MKLDNNConvLayer::resetBwdWgtPD(
                                             padR,
                                             padKind)
                         : conv_bwdWgt::desc(algo,
-                                            inVal_->getMemoryDesc(),
+                                            inVals_[0]->getMemoryDesc(),
                                             wgtVal_->getMemoryDesc(),
                                             outVal_->getMemoryDesc(),
                                             strides,
@@ -260,7 +257,7 @@ void MKLDNNConvLayer::resetBwdWgtPD(
                                             padR,
                                             padKind);
   pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
-  CHECK_PRIMITIVE_DESC_EQ(inVal_, pd->src_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(inVals_[0], pd->src_primitive_desc());
   CHECK_PRIMITIVE_DESC_EQ(
       outVal_,
       pd->diff_dst_primitive_desc(),
@@ -280,12 +277,12 @@ void MKLDNNConvLayer::resetBwdDataPD(
 
   memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
   loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
-  CHECK(inVal_) << "Should have internal input value";
+  CHECK(inVals_[0]) << "Should have internal input value";
   CHECK(outVal_) << "Should have internal output value";
   // create backward data using input and output value memory desc
   // but using weight memory desc with any format
   auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
-                                        inVal_->getMemoryDesc(),
+                                        inVals_[0]->getMemoryDesc(),
                                         MKLDNNMatrix::createMemoryDesc(wgtDims),
                                         outVal_->getMemoryDesc(),
                                         strides,
@@ -294,7 +291,7 @@ void MKLDNNConvLayer::resetBwdDataPD(
                                         padding_kind::zero);
   pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
   CHECK_PRIMITIVE_DESC_EQ(
-      inVal_,
+      inVals_[0],
       pd->diff_src_primitive_desc(),
       "primitive desc of in value and grad should be equal");
   CHECK_PRIMITIVE_DESC_EQ(
@@ -346,12 +343,12 @@ void MKLDNNConvLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  CHECK(inVal_);
+  CHECK(inVals_[0]);
   // add bwdWgt handle
   if (bias) {
-    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt, *bias));
+    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt, *bias));
   } else {
-    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt));
+    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt));
   }
   pipeline.push_back(*bwdWgt_);
 
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/gserver/layers/MKLDNNConvLayer.h
index 9c69136684e5f9005860b476ec6ed1bbc9ceff6c..3e754a0e65771879e836c13d63d5a5c8be3a699a 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
@@ -69,18 +69,14 @@ public:
             const ParameterMap& parameterMap) override;
 
   void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
 
   void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void updateWeights(const UpdateCallback& callback) override;
@@ -107,48 +103,26 @@ protected:
                         mkldnn::memory::dims& padL,
                         mkldnn::memory::dims& padR);
 
-  /**
-   * reset the forward primitive descriptor.
-   */
   void resetFwdPD(std::shared_ptr<conv_fwd::primitive_desc>& pd);
-  /**
-   * reset the MKLDNNMatrix buffers used in forward.
-   */
   void resetFwdBuffers(std::shared_ptr<conv_fwd::primitive_desc>& pd,
                        MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  /**
-   * reset the forward pipeline.
-   */
   void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
                         std::shared_ptr<conv_fwd::primitive_desc>& pd,
                         MKLDNNMatrixPtr& in,
                         MKLDNNMatrixPtr& wgt,
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out);
-
-  /**
-   * reset the backward weight primitive descriptor.
-   */
   void resetBwdWgtPD(std::shared_ptr<conv_bwdWgt::primitive_desc>& pd);
-  /**
-   * reset the backward data primitive descriptor.
-   */
   void resetBwdDataPD(std::shared_ptr<conv_bwdData::primitive_desc>& pd);
-  /**
-   * reset the MKLDNNMatrix buffers used in backward.
-   */
   void resetBwdBuffers(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
                        std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
                        MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  /**
-   * reset the backward pipeline.
-   */
   void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
                         std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
                         std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index 350ec65fffbc73c3a6e4245f763f4c6aa868f574..c8778bdd077c4b6d170140be92bdcdd7e8e81bb2 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -74,7 +74,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
 }
 
 void MKLDNNFcLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
   reshapeInput(bs, ih, iw);
 
   CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
@@ -87,32 +87,29 @@ void MKLDNNFcLayer::reshape(
 }
 
 void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
-                             MKLDNNMatrixPtr& in,
-                             MKLDNNMatrixPtr& wgt,
-                             MKLDNNMatrixPtr& bias,
+                             std::vector<MKLDNNMatrixPtr>& inputs,
                              MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(in, wgt, bias, out);
+  resetFwdBuffers(inputs[0], wgtVal_, biasVal_, out);
 
-  resetFwdPD(fwdPD_, in, wgt, bias, out);
+  resetFwdPD(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
 
-  resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
 }
 
 void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
-                             MKLDNNMatrixPtr& in,
-                             MKLDNNMatrixPtr& wgt,
-                             MKLDNNMatrixPtr& bias,
+                             std::vector<MKLDNNMatrixPtr>& inputs,
                              MKLDNNMatrixPtr& out) {
   std::shared_ptr<fc_bwdWgt::primitive_desc> bwdWgtPD;
   std::shared_ptr<fc_bwdData::primitive_desc> bwdDataPD;
 
-  resetBwdBuffers(in, wgt, bias, out);
+  resetBwdBuffers(inputs[0], wgtGrad_, biasGrad_, out);
 
-  resetBwdWgtPD(bwdWgtPD, wgt, bias, out);
+  resetBwdWgtPD(bwdWgtPD, wgtGrad_, biasGrad_, out);
 
-  resetBwdDataPD(bwdDataPD, in, out);
+  resetBwdDataPD(bwdDataPD, inputs[0], out);
 
-  resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
+  resetBwdPipeline(
+      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
 }
 
 void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
@@ -193,9 +190,9 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                     MKLDNNMatrixPtr& wgt,
                                     MKLDNNMatrixPtr& bias,
                                     MKLDNNMatrixPtr& out) {
-  CHECK(inVal_ && outVal_);
+  CHECK(inVals_[0] && outVal_);
   resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
 
   CHECK(wgtVal_);
   resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
@@ -212,14 +209,15 @@ void MKLDNNFcLayer::resetBwdWgtPD(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  CHECK(inVal_);
-  fc_bwdWgt::desc bwdWgtDesc = bias ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
-                                                      wgt->getMemoryDesc(),
-                                                      bias->getMemoryDesc(),
-                                                      out->getMemoryDesc())
-                                    : fc_bwdWgt::desc(inVal_->getMemoryDesc(),
-                                                      wgt->getMemoryDesc(),
-                                                      out->getMemoryDesc());
+  CHECK(inVals_[0]);
+  fc_bwdWgt::desc bwdWgtDesc =
+      bias ? fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
+                             wgt->getMemoryDesc(),
+                             bias->getMemoryDesc(),
+                             out->getMemoryDesc())
+           : fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
+                             wgt->getMemoryDesc(),
+                             out->getMemoryDesc());
   pd.reset(new fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
 }
 
@@ -245,11 +243,11 @@ void MKLDNNFcLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  CHECK(inVal_);
+  CHECK(inVals_[0]);
   if (bias) {
-    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt, *bias));
+    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt, *bias));
   } else {
-    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt));
+    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt));
   }
   pipeline.push_back(*bwdWgt_);
 
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
index ee861763ff3dc10ddb4c119358b80dbe1614aecb..283dc9b540531f6009ae6e2485b7c12d4e5cf2e3 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -52,18 +52,14 @@ public:
             const ParameterMap& parameterMap) override;
 
   void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
 
   void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void updateWeights(const UpdateCallback& callback) override;
@@ -73,11 +69,6 @@ public:
   void convertWeightsToPaddle() override;
 
 protected:
-  /**
-   * Forward functions: reset buffers(input, output, weight and bias),
-   *                    reset primitive descriptor,
-   *                    reset pipeline.
-   */
   void resetFwdBuffers(MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
@@ -93,13 +84,6 @@ protected:
                         MKLDNNMatrixPtr& wgt,
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out);
-
-  /**
-   * Backward functions: reset buffers(input, output, weight and bias),
-   *                     reset primitive descriptor for backward weight,
-   *                     reset primitive descriptor for backward data,
-   *                     reset pipeline.
-   */
   void resetBwdBuffers(MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp
index e75ac5ba4647a8267b7bc189893bd7adb5c3053f..6fbf3c7fdec2f537769adb660c67c5a597beb609 100644
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -21,8 +21,8 @@ namespace paddle {
 
 bool MKLDNNLayer::init(const LayerMap& layerMap,
                        const ParameterMap& parameterMap) {
-  CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
-                          << "Please set WITH_MKLDNN=ON "
+  CHECK(FLAGS_use_mkldnn) << "MKLDNNLayers only support use_mkldnn."
+                          << "Please set WITH_MKL=ON "
                           << "and set use_mkldnn=True";
   CHECK(!useGpu_) << "Do not support GPU yet";
 
@@ -48,31 +48,20 @@ void MKLDNNLayer::forward(PassType passType) {
     REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
     CHECK(!inputLayers_.empty());
     copySeqInfoToOutputs();
-    size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt();
-    if (inputElemenCnt_ != elemenCnt) {
+    if (condition_ != keepCondition()) {
       VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
-      // reset when input total sizes changed, not only the batchsize
-      inputElemenCnt_ = elemenCnt;
-      pipelineFwd_.clear();
+      condition_ = keepCondition();
       reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
-      // all cpu device output grad or value share output's
+      printSizeInfo();
+      // the output_.value and output_.grad are shared with CPU device
       shareCPUDevice();
-      resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
-      // MKLDNNLayer output value should be MKLDNNMatrix
-      // so external output value is necessary.
-      // Then external input value is not necessary,
-      // since input may be mkldnn internal buffer.
-      CHECK(extOutVal_) << "external output value is necessary";
-      output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
-      CHECK(inVal_ && outVal_) << "internal memories are necessary";
-      if (cvtInVal_) {
-        pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_);
-      }
-      if (cvtOutVal_) {
-        pipelineFwd_.push_back(*cvtOutVal_);
-      }
+      pipelineFwd_.clear();
+      inVals_.resize(inputLayers_.size(), nullptr);
+      extInVals_.resize(inputLayers_.size(), nullptr);
+      cvtInVals_.resize(inputLayers_.size(), nullptr);
+      resetFwd(pipelineFwd_, inVals_, outVal_);
+      prepareValueConversions(pipelineFwd_);
       convertWeightsFromPaddle();
-      printSizeInfo();
       printValueFormat();
       needResetBwd_ = true;
     }
@@ -80,8 +69,8 @@ void MKLDNNLayer::forward(PassType passType) {
     if (inputLayers_[0]->getType() == "data" && inputLayers_.size() == 1) {
       // Update input value data when input layer is "data" type,
       // since the input value data address might be changed.
-      CHECK(extInVal_);
-      extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
+      CHECK(extInVals_[0]);
+      extInVals_[0]->setData(getInputValue(0, CPU_DEVICE)->getData());
     }
 
     if (!outputOnlyMKLDNN_) {
@@ -99,22 +88,13 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
   if (needResetBwd_) {
     VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
     pipelineBwd_.clear();
+    inGrads_.resize(inputLayers_.size(), nullptr);
+    extInGrads_.resize(inputLayers_.size(), nullptr);
+    cvtInGrads_.resize(inputLayers_.size(), nullptr);
     pipelineMergeGrad_.clear();
     mergeGrad_ = nullptr;
-    resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
-    // external output grad is not necessary
-    // since output may be mkldnn internal buffer or merge them directly.
-    CHECK(outGrad_) << "internal output grad is necessary";
-    if (extOutGrad_) {
-      CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
-          << "the external buffer should share the same data with output_.grad";
-    }
-    if (cvtOutGrad_) {
-      pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_);
-    }
-    if (cvtInGrad_) {
-      pipelineBwd_.push_back(*cvtInGrad_);
-    }
+    resetBwd(pipelineBwd_, inGrads_, outGrad_);
+    prepareGradConversions(pipelineBwd_);
     printGradFormat();
     needResetBwd_ = false;
   }
@@ -138,8 +118,11 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
   }
 }
 
-void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) {
-  const Argument& input = inputLayers_[0]->getOutput();
+void MKLDNNLayer::reshapeInput(int& batchsize,
+                               int& height,
+                               int& width,
+                               size_t idx) {
+  const Argument& input = inputLayers_[idx]->getOutput();
   batchsize = input.getBatchSize();
   int h = input.getFrameHeight();
   int w = input.getFrameWidth();
@@ -173,27 +156,30 @@ void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
 void MKLDNNLayer::resetInValue(
     MKLDNNMatrixPtr& in,
     const std::shared_ptr<memory::primitive_desc>& intPD,
-    size_t inputIdx) {
-  cvtInVal_ = nullptr;
-  extInVal_ = nullptr;
+    size_t idx,
+    int inputChannel) {
+  cvtInVals_[idx] = nullptr;
+  extInVals_[idx] = nullptr;
   in = nullptr;
-  CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
+  inputChannel = inputChannel == 0 ? ic_ : inputChannel;
+  CHECK_GT(bs_ * inputChannel * ih_ * iw_, 0);
   auto extPD = MKLDNNMatrix::createPrimitiveDesc(
-      {bs_, ic_, ih_, iw_}, format::nchw, engine_);
-  const MatrixPtr& inMat = inputLayers_[inputIdx]->getOutputValue();
-  extInVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
-  CHECK_EQ(inputIsOnlyMKLDNN(), extInVal_ != nullptr);
-  if (extInVal_ == nullptr || extInVal_->getFormat() == format::nc) {
-    extInVal_ = MKLDNNMatrix::create(extPD, inMat);
+      {bs_, inputChannel, ih_, iw_}, format::nchw, engine_);
+  const MatrixPtr& inMat = inputLayers_[idx]->getOutputValue();
+  extInVals_[idx] = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
+  CHECK_EQ(inputIsOnlyMKLDNN(), extInVals_[idx] != nullptr);
+  if (extInVals_[idx] == nullptr ||
+      extInVals_[idx]->getFormat() == format::nc) {
+    extInVals_[idx] = MKLDNNMatrix::create(extPD, inMat);
   }
-  in = extInVal_;
+  in = extInVals_[idx];
   if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
     return;
   }
   // need create reorder
   in = MKLDNNMatrix::create(*intPD);
-  cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
-  CHECK(cvtInVal_) << "should not be emptry";
+  cvtInVals_[idx] = MKLDNNMatrix::createReorder(extInVals_[idx], in);
+  CHECK(cvtInVals_[idx]) << "should not be emptry";
 }
 
 void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
@@ -215,11 +201,11 @@ void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
 
 void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
                               memory::primitive_desc intPD,
-                              size_t inputIdx) {
-  cvtInGrad_ = nullptr;
-  extInGrad_ = nullptr;
+                              size_t idx) {
+  cvtInGrads_[idx] = nullptr;
+  extInGrads_[idx] = nullptr;
   in = nullptr;
-  LayerPtr& input = inputLayers_[inputIdx];
+  LayerPtr& input = inputLayers_[idx];
   if (input->getOutputGrad() == nullptr) {
     // no need input grad
     return;
@@ -234,23 +220,25 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
   in = MKLDNNMatrix::create(intPD, inMat);
   Argument& arg = input->getOutput(this->getName());
   arg.grad = std::dynamic_pointer_cast<Matrix>(in);
-  CHECK_PRIMITIVE_DESC_EQ(inVal_, intPD);
+  CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
   if (inputIsOnlyMKLDNN()) {
     return;
   }
 
-  extInGrad_ = in;
-  if (isPaddleFormat(extInGrad_->getFormat())) {
+  extInGrads_[idx] = in;
+  if (isPaddleFormat(extInGrads_[idx]->getFormat())) {
     return;
   }
   // need create reorder
-  CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
+  CHECK(extInVals_[idx] != nullptr &&
+        isPaddleFormat(extInVals_[idx]->getFormat()))
       << "should have external input value and the format must be nchw(nc)";
-  extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
-  CHECK_PRIMITIVE_DESC_EQ(inVal_, intPD);
+  extInGrads_[idx] =
+      MKLDNNMatrix::create(extInVals_[idx]->getPrimitiveDesc(), inMat);
+  CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
   in = MKLDNNMatrix::create(intPD);
-  cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
-  CHECK(cvtInGrad_);
+  cvtInGrads_[idx] = MKLDNNMatrix::createReorder(in, extInGrads_[idx]);
+  CHECK(cvtInGrads_[idx]);
 }
 
 void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
@@ -306,22 +294,8 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
     srcs.push_back(*src);
   }
 
-  // TODO(TJ): remove me when mkldnn sum support different formats
-  for (size_t i = 1; i < srcPDs.size(); ++i) {
-    CHECK(srcPDs[0] == srcPDs[i]);
-  }
-  tmpOutGrad_ = out;
-  tmpCvt_ = nullptr;
-  if (out->getPrimitiveDesc() != srcPDs[0]) {
-    tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]);
-    tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
-    CHECK(tmpCvt_);
-    pipelineMergeGrad_.push_back(*tmpCvt_);
-  }
-
-  auto sumPD =
-      sum::primitive_desc(tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
-  mergeGrad_.reset(new sum(sumPD, srcs, *tmpOutGrad_));
+  auto sumPD = sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs);
+  mergeGrad_.reset(new sum(sumPD, srcs, *out));
   pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
 }
 
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 7479c34c92b5231b2521493bc631474d4efd4224..e48b9b5a91f7f17cb3f31e9140f1428ba8954a20 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -34,15 +34,16 @@ typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
  */
 class MKLDNNLayer : public Layer {
 protected:
-  // input value element count
-  size_t inputElemenCnt_;
   // batch size
   int bs_;
+  // their sizes are always from the first input layer
   // input image channel, height and width
   int ic_, ih_, iw_;
   // output image channel, height and width
   int oc_, oh_, ow_;
 
+  // the condition that forward need be reset
+  size_t condition_;
   // backward also need reset after reset forward handle
   bool needResetBwd_;
 
@@ -67,18 +68,18 @@ protected:
    * When all layers are mkldnn layers, they could save internal data.
    */
   // below MKLDNNMatrix buffers are all internal buffers
-  MKLDNNMatrixPtr inVal_;
-  MKLDNNMatrixPtr inGrad_;
+  std::vector<MKLDNNMatrixPtr> inVals_;
+  std::vector<MKLDNNMatrixPtr> inGrads_;
   MKLDNNMatrixPtr outVal_;
   MKLDNNMatrixPtr outGrad_;
   // below are external value and grad
-  MKLDNNMatrixPtr extInVal_;
-  MKLDNNMatrixPtr extInGrad_;
+  std::vector<MKLDNNMatrixPtr> extInVals_;
+  std::vector<MKLDNNMatrixPtr> extInGrads_;
   MKLDNNMatrixPtr extOutVal_;
   MKLDNNMatrixPtr extOutGrad_;
   // convert handle between external and internal buffers
-  std::shared_ptr<mkldnn::reorder> cvtInVal_;
-  std::shared_ptr<mkldnn::reorder> cvtInGrad_;
+  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInVals_;
+  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInGrads_;
   std::shared_ptr<mkldnn::reorder> cvtOutVal_;
   std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
 
@@ -93,23 +94,11 @@ protected:
   std::vector<mkldnn::primitive> pipelineMergeGrad_;
   // tmp input argument to save input grad, only used to merge grad
   Argument tmpInArg_;
-  // since mkldnn sum do not support different formats:
-  // can refer to https://github.com/01org/mkl-dnn/issues/134
-  // so need create reorder manually and save tmp MKLDNNMatrix
-  MKLDNNMatrixPtr tmpOutGrad_;
-  std::shared_ptr<mkldnn::primitive> tmpCvt_;
 
 public:
   explicit MKLDNNLayer(const LayerConfig& config)
       : Layer(config),
-        inputElemenCnt_(0),
-        bs_(0),
-        ic_(0),
-        ih_(0),
-        iw_(0),
-        oc_(0),
-        oh_(0),
-        ow_(0),
+        condition_(0),
         needResetBwd_(true),
         outputOnlyMKLDNN_(false),
         engine_(mkldnn::engine::cpu, 0),
@@ -125,31 +114,28 @@ public:
   virtual void backward(const UpdateCallback& callback);
 
   /**
-   * reshape the input image sizes
-   * and reset output image and buffer size
-   * output channel can not be changed
+   * reshape the input and output channels and image sizes
+   * and reset output buffer size
    */
   virtual void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) = 0;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) = 0;
 
   /**
    * reset the mkldnn forward primitve and memories
    * only would be called when input size changes
+   * weight and bias buffers should be coverd by child class itself
    */
   virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
                         MKLDNNMatrixPtr& out) = 0;
 
   /**
    * reset the mkldnn backward primitve and memories
    * only would be called when needed
+   * weight and bias buffers should be coverd by child class itself
    */
   virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
                         MKLDNNMatrixPtr& out) = 0;
 
   /**
@@ -175,10 +161,19 @@ public:
   void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); }
 
 protected:
+  /**
+   * Some layers may have different condition to reset the forward.
+   * The function returns the condition that do not need reset forward.
+   */
+  inline virtual size_t keepCondition() {
+    // reset when the first input element size changed, not only the batchsize
+    return inputLayers_[0]->getOutputValue()->getElementCnt();
+  }
+
   /**
    * reshape the input image sizes and input batchsize
    */
-  void reshapeInput(int& batchsize, int& height, int& width);
+  void reshapeInput(int& batchsize, int& height, int& width, size_t idx = 0);
 
   /**
    * reshape output image sizes
@@ -196,11 +191,13 @@ protected:
   /**
    * reset input value from input MKLDNNMatrix and internal primitive desc.
    * reset both internal and external buffer and create reorder if necessary.
+   * input channel may be different in concat.
    */
   void resetInValue(
       MKLDNNMatrixPtr& in,
       const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
-      size_t inputIdx = 0);
+      size_t idx = 0,
+      int inputChannel = 0);
 
   /**
    * reset output value from internal primitive desc.
@@ -215,7 +212,7 @@ protected:
    */
   void resetInGrad(MKLDNNMatrixPtr& in,
                    mkldnn::memory::primitive_desc intPD,
-                   size_t inputIdx = 0);
+                   size_t idx = 0);
 
   /**
    * reset output grad from internal primitive desc.
@@ -293,17 +290,19 @@ protected:
    * print the mkldnn memory format of value
    */
   virtual void printValueFormat() {
-    if (extInVal_) {
-      VLOG(MKLDNN_FMTS) << extInVal_->getFormat() << " >>> ";
-    }
-    if (inVal_) {
-      VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>>";
+    for (size_t i = 0; i < inVals_.size(); ++i) {
+      if (!inVals_[i]) {
+        continue;
+      }
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << (extInVals_[i] ? extInVals_[i]->getFormat()
+                                                  : inVals_[i]->getFormat())
+                        << " >>> " << inVals_[i]->getFormat() << " >>>";
     }
     if (outVal_) {
-      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
-    }
-    if (extOutVal_) {
-      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
+      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "
+                        << (extOutVal_ ? extOutVal_->getFormat()
+                                       : outVal_->getFormat());
     }
     if (wgtVal_) {
       VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
@@ -317,17 +316,19 @@ protected:
    * print the mkldnn memory format of grad
    */
   virtual void printGradFormat() {
-    if (extOutGrad_) {
-      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
-    }
     if (outGrad_) {
-      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
+      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "
+                        << (extOutGrad_ ? extOutGrad_->getFormat()
+                                        : outGrad_->getFormat());
     }
-    if (inGrad_) {
-      VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<";
-    }
-    if (extInGrad_) {
-      VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< ";
+    for (size_t i = 0; i < inGrads_.size(); ++i) {
+      if (!inGrads_[i]) {
+        continue;
+      }
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << (extInGrads_[i] ? extInGrads_[i]->getFormat()
+                                                   : inGrads_[i]->getFormat())
+                        << " <<< " << inGrads_[i]->getFormat() << " <<<";
     }
     if (wgtGrad_) {
       VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
@@ -434,6 +435,41 @@ private:
       outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
     }
   }
+
+  void prepareValueConversions(std::vector<mkldnn::primitive>& pipeline) {
+    // MKLDNNLayer output value should be MKLDNNMatrix
+    // so external output value is necessary.
+    // Then external input value is not necessary,
+    // since input may be mkldnn internal buffer.
+    CHECK(extOutVal_) << "external output value is necessary";
+    output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
+    CHECK(inVals_[0] && outVal_) << "internal memories are necessary";
+    for (size_t i = 0; i < cvtInVals_.size(); ++i) {
+      if (cvtInVals_[i]) {
+        pipeline.insert(pipeline.begin(), *cvtInVals_[i]);
+      }
+    }
+    if (cvtOutVal_) {
+      pipeline.push_back(*cvtOutVal_);
+    }
+  }
+  void prepareGradConversions(std::vector<mkldnn::primitive>& pipeline) {
+    // external output grad is not necessary
+    // since output may be mkldnn internal buffer or merge them directly.
+    CHECK(outGrad_) << "internal output grad is necessary";
+    if (extOutGrad_) {
+      CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
+          << "the external buffer should share the same data with output_.grad";
+    }
+    if (cvtOutGrad_) {
+      pipeline.insert(pipeline.begin(), *cvtOutGrad_);
+    }
+    for (size_t i = 0; i < cvtInGrads_.size(); ++i) {
+      if (cvtInGrads_[i]) {
+        pipeline.push_back(*cvtInGrads_[i]);
+      }
+    }
+  }
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
index a18c455beab96ef25b5545281bae4d48cec98d9e..a8252593c8fbb8013ab909e74a057850ba54bcaa 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -58,10 +58,11 @@ bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
 }
 
 void MKLDNNPoolLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
   reshapeInput(bs, ih, iw);
   // ic_ and oc can not be changed
-  CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
       << "Input channel can not be changed";
 
   // cal output sizes
@@ -74,29 +75,25 @@ void MKLDNNPoolLayer::reshape(
 }
 
 void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
-                               MKLDNNMatrixPtr& in,
-                               MKLDNNMatrixPtr& wgt,
-                               MKLDNNMatrixPtr& bias,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
                                MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(in, out);
+  resetFwdBuffers(inputs[0], out);
 
-  resetFwdPD(fwdPD_, in, out);
+  resetFwdPD(fwdPD_, inputs[0], out);
 
-  resetFwdPipeline(pipeline, fwdPD_, in, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
 }
 
 void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
-                               MKLDNNMatrixPtr& in,
-                               MKLDNNMatrixPtr& wgt,
-                               MKLDNNMatrixPtr& bias,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
                                MKLDNNMatrixPtr& out) {
   std::shared_ptr<pool_bwd::primitive_desc> pd;
 
-  resetBwdBuffers(in, out);
+  resetBwdBuffers(inputs[0], out);
 
-  resetBwdPD(pd, in, out);
+  resetBwdPD(pd, inputs[0], out);
 
-  resetBwdPipeline(pipeline, pd, in, out);
+  resetBwdPipeline(pipeline, pd, inputs[0], out);
 }
 
 void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
@@ -151,9 +148,9 @@ void MKLDNNPoolLayer::resetFwdPipeline(
 
 void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                       MKLDNNMatrixPtr& out) {
-  CHECK(inVal_ && outVal_);
+  CHECK(inVals_[0] && outVal_);
   resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
 }
 
 void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/gserver/layers/MKLDNNPoolLayer.h
index c5ec87828bfb28b4502b4ec6b47287089c514204..dad60156f0ef7caa059ff6c70d1040e7e34c938f 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.h
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
@@ -53,18 +53,14 @@ public:
             const ParameterMap& parameterMap) override;
 
   void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
 
   void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void printSizeInfo() override {
@@ -75,11 +71,6 @@ public:
   }
 
 protected:
-  /**
-   * Forward functions: reset buffers(input, output),
-   *                    reset primitive descriptor,
-   *                    reset pipeline.
-   */
   void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr in,
@@ -88,12 +79,6 @@ protected:
                         std::shared_ptr<pool_fwd::primitive_desc>& pd,
                         MKLDNNMatrixPtr& in,
                         MKLDNNMatrixPtr& out);
-
-  /**
-   * Backward functions: reset buffers(input, output),
-   *                     reset primitive descriptor,
-   *                     reset pipeline.
-   */
   void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
   void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr& in,
diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/gserver/layers/ROIPoolLayer.cpp
index 02402894d3354a6af221948a3360ef830881bf39..2c8256b91c97b513ce7237b8174c522430094926 100644
--- a/paddle/gserver/layers/ROIPoolLayer.cpp
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "ROIPoolLayer.h"
+#include <cfloat>
 
 namespace paddle {
 
@@ -126,10 +127,8 @@ void ROIPoolLayer::forward(PassType passType) {
 
           bool isEmpty = (hend <= hstart) || (wend <= wstart);
           size_t poolIndex = ph * pooledWidth_ + pw;
-          if (isEmpty) {
-            outputData[poolIndex] = 0;
-            argmaxData[poolIndex] = -1;
-          }
+          outputData[poolIndex] = isEmpty ? 0 : -FLT_MAX;
+          argmaxData[poolIndex] = -1;
 
           for (size_t h = hstart; h < hend; ++h) {
             for (size_t w = wstart; w < wend; ++w) {
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 4bea348f637f39444e8aad89278e6366ecd73b1d..c295ea19c9ccb3d05c509a41925d2c36efdba8ef 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -29,7 +29,7 @@ gserver_test(test_KmaxSeqScore)
 gserver_test(test_Expand)
 gserver_test(test_MaxPoolingWithMaskOutput)
 
-########## test_Mkldnn layers and activations ##########
+########## test_MKLDNN layers and activations ##########
 if(WITH_MKLDNN)
     add_unittest_without_exec(test_MKLDNN
         test_MKLDNN.cpp
@@ -62,17 +62,6 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
 endif()
 
 if(NOT MOBILE_INFERENCE)
-################### test_ProtoDataProvider ############
-    add_unittest_without_exec(test_ProtoDataProvider
-        test_ProtoDataProvider.cpp)
-
-    # test_ProtoDataProvider will mkdir as same name,
-    # so if WORKING_DIRECTORY is default directory, then
-    # mkdir will get error.
-    add_test(NAME test_ProtoDataProvider
-        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-
 ################## test_Evaluator #######################
     add_unittest(test_Evaluator
         test_Evaluator.cpp)
@@ -110,3 +99,24 @@ add_test(NAME test_PyDataProvider2
    COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
         WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
 )
+
+################# test_CompareSparse ##################
+add_unittest_without_exec(test_CompareSparse
+    test_CompareSparse.cpp)
+if(NOT ON_TRAVIS)
+  add_test(NAME test_CompareSparse
+    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+          ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
+              ./.set_port.sh -p port -n 6
+                  ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endif()
+
+################ test_CompareTwoNets ######################
+add_unittest_without_exec(test_CompareTwoNets
+    test_CompareTwoNets.cpp)
+add_test(NAME test_CompareTwoNets
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+        ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
+        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
index ca55a45bc77b4e171619ab788d7c7dfeefcd036a..9d61533c0b6f20c41130d7b7c15ad93392b2d24c 100644
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -23,7 +23,7 @@ limitations under the License. */
 namespace paddle {
 
 /**
- * @brief test the functionality of Mkldnnlayers
+ * @brief test the functionality of MKLDNNlayers and MKLDNNActivations
  * refer to paddle original function
  */
 class MKLDNNTester {
diff --git a/paddle/gserver/tests/proto_files.txt b/paddle/gserver/tests/proto_files.txt
deleted file mode 100644
index 691b38c7940bd21360eb00384e060554aa4b3e22..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/proto_files.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-./test_ProtoDataProvider/data1.bin
-./test_ProtoDataProvider/data2.bin
diff --git a/paddle/gserver/tests/proto_files_compressed.txt b/paddle/gserver/tests/proto_files_compressed.txt
deleted file mode 100644
index 7413c81e185d02e0d03aefa06480b9722357c5eb..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/proto_files_compressed.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-./test_ProtoDataProvider/data1.bin.gz
-./test_ProtoDataProvider/data2.bin.gz
diff --git a/paddle/gserver/tests/sequence_lstm.conf b/paddle/gserver/tests/sequence_lstm.conf
new file mode 100644
index 0000000000000000000000000000000000000000..f49a827f22edce056eaf9903e99b732cab7f3784
--- /dev/null
+++ b/paddle/gserver/tests/sequence_lstm.conf
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
+dict_file = dict()
+for line_count, line in enumerate(open(dict_path, "r")):
+    dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 256
+label_dim = 3
+sparse_update = get_config_arg("sparse_update", bool, False)
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+    input=data,
+    size=word_dim,
+    param_attr=ParamAttr(sparse_update=sparse_update))
+
+with mixed_layer(size=hidden_dim * 4) as lstm_input:
+    lstm_input += full_matrix_projection(input=emb)
+
+lstm = lstmemory(
+    input=lstm_input,
+    act=TanhActivation(),
+    gate_act=SigmoidActivation(),
+    state_act=TanhActivation())
+
+lstm_last = last_seq(input=lstm)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=lstm_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
diff --git a/paddle/gserver/tests/sequence_recurrent.py b/paddle/gserver/tests/sequence_recurrent.py
new file mode 100644
index 0000000000000000000000000000000000000000..4895df186bfecc5cb5263676a9cd5bac5039d565
--- /dev/null
+++ b/paddle/gserver/tests/sequence_recurrent.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
+dict_file = dict()
+for line_count, line in enumerate(open(dict_path, "r")):
+    dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 128
+label_dim = 3
+
+# This config is designed to be equivalent with sequence_recurrent_group.py
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
+
+recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
+
+recurrent_last = last_seq(input=recurrent)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=recurrent_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
diff --git a/paddle/gserver/tests/sequence_recurrent_group.py b/paddle/gserver/tests/sequence_recurrent_group.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1d54542e3bc4e89f70d31d5e89c0f44953c9f90
--- /dev/null
+++ b/paddle/gserver/tests/sequence_recurrent_group.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
+dict_file = dict()
+for line_count, line in enumerate(open(dict_path, "r")):
+    dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 128
+label_dim = 3
+
+# This config is designed to be equivalent with sequence_recurrent.py
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
+
+
+def step(y):
+    mem = memory(name="rnn_state", size=hidden_dim)
+    with mixed_layer(
+            name="rnn_state",
+            size=hidden_dim,
+            bias_attr=False,
+            act=SoftmaxActivation()) as out:
+        out += identity_projection(input=y)
+        out += full_matrix_projection(
+            input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__"))
+    return out
+
+
+recurrent = recurrent_group(name="rnn", step=step, input=emb)
+
+recurrent_last = last_seq(input=recurrent)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=recurrent_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/gserver/tests/test_CompareSparse.cpp
similarity index 98%
rename from paddle/trainer/tests/test_CompareSparse.cpp
rename to paddle/gserver/tests/test_CompareSparse.cpp
index 5f1834bd730375fc10762fc19788d0c693f8e752..c6e07650fc4805a25baf38b9059f6c996d00cafc 100644
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/gserver/tests/test_CompareSparse.cpp
@@ -22,8 +22,7 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-static const string& configFile1 =
-    "trainer/tests/sample_trainer_config_compare_sparse.conf";
+static const string& configFile1 = "gserver/tests/sequence_lstm.conf";
 
 DECLARE_bool(use_gpu);
 DECLARE_string(config);
diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/gserver/tests/test_CompareTwoNets.cpp
similarity index 95%
rename from paddle/trainer/tests/test_CompareTwoNets.cpp
rename to paddle/gserver/tests/test_CompareTwoNets.cpp
index 94f65e545d116c802fb4877dc14f07aaaf83a4fb..801d9607565910b1f7f68a9c4532de5877e44f30 100644
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/gserver/tests/test_CompareTwoNets.cpp
@@ -30,8 +30,6 @@ DECLARE_bool(use_gpu);
 DECLARE_string(config);
 DECLARE_string(nics);
 
-DEFINE_string(config_file_a, "", "config of one network to compare");
-DEFINE_string(config_file_b, "", "config of another network to compare");
 DEFINE_bool(need_high_accuracy,
             false,
             "whether need to run in double accuracy");
@@ -42,6 +40,10 @@ DEFINE_double(
 DECLARE_bool(thread_local_rand_use_global_seed);
 DECLARE_int32(seed);
 
+static const string& config_file_a = "gserver/tests/sequence_recurrent.py";
+static const string& config_file_b =
+    "gserver/tests/sequence_recurrent_group.py";
+
 struct ComData {
   vector<Argument> outArgs;
   vector<ParameterPtr> parameters;
@@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) {
   DataBatch dataBatch;
   int32_t batchSize = trainer.getConfig().opt_config().batch_size();
 
+  trainer.getDataProvider()->reset();
   trainer.getDataProvider()->setSkipShuffle();
   trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
 
@@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
 
 TEST(Trainer, create) {
   ComData dataA;
-  calcGradient(dataA, FLAGS_config_file_a);
+  calcGradient(dataA, config_file_a);
   LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
 
   ComData dataB;
-  calcGradient(dataB, FLAGS_config_file_b);
+  calcGradient(dataB, config_file_b);
   LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
 
   compareGradient(dataA, dataB);
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 3517d293e3c901caaa19952b04e56d1ef0d2b46e..c5359f272b4bed4d4d2483bf19d7ae482b0d33dd 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -583,6 +583,7 @@ TEST(Layer, maxoutLayer) {
     testLayerGrad(config, "maxout", 10, false, useGpu);
   }
 }
+
 void testFcLayer(string format, size_t nnz) {
   TestConfig config;
   config.biasSize = 1024;
@@ -680,12 +681,13 @@ TEST(Layer, hsigmoidLayer) {
   config.layerConfig.add_inputs();
   config.layerConfig.add_inputs();
 
-  // Not support GPU now
-  testLayerGrad(config,
-                "hsigmoid",
-                100,
-                /* trans */ false, /* useGpu */
-                false);
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "hsigmoid",
+                  100,
+                  /* trans */ false,
+                  /* useGpu */ useGpu);
+  }
 }
 
 TEST(Layer, multi_cross) {
@@ -1081,6 +1083,21 @@ TEST(Layer, InterpolationLayer) {
   }
 }
 
+TEST(Layer, DotProdLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("dot_prod");
+  config.layerConfig.set_size(1);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "dot_prod", 10, false, useGpu);
+  }
+}
+
 TEST(Layer, OuterProdLayer) {
   TestConfig config;
   config.layerConfig.set_type("out_prod");
@@ -2429,6 +2446,44 @@ TEST(Layer, ScaleSubRegionLayer) {
   }
 }
 
+TEST(Layer, L2DistanceLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("l2_distance");
+  config.layerConfig.set_size(1);
+  config.biasSize = 0;
+
+  const size_t input_dim = 27;
+  const size_t batch_size = 11;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", input_dim, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", input_dim, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "l2_distance", batch_size, false, useGpu);
+  }
+}
+
+void testFactorizationMachineLayer(InputType type, bool useGpu) {
+  const int FACTOR_SIZE = 10;
+  TestConfig config;
+  config.layerConfig.set_type("factorization_machine");
+  config.layerConfig.set_factor_size(FACTOR_SIZE);
+  config.layerConfig.set_size(1);
+  config.biasSize = 0;
+  config.inputDefs.push_back({type, "layer_0", 128, 1280});
+  config.layerConfig.add_inputs();
+  testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
+}
+
+TEST(Layer, FactorizationMachineLayer) {
+  for (auto useGpu : {false, true}) {
+    testFactorizationMachineLayer(INPUT_DATA, useGpu);
+  }
+  testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index a859e34c8996d81f14bf1edcb6e23d5a4f687e6b..56b523f220c2a405851b89db5f63e9aa50bfaaf7 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -313,6 +313,47 @@ TEST(MKLDNNLayer, AddtoLayer) {
   testAddtoLayer({4, 12, 1, 1}, 3);
 }
 
+static void getMKLDNNConcatConfig(TestConfig& cfg,
+                                  const std::vector<testImageDesc>& inputs) {
+  CHECK_GE(inputs.size(), 2UL) << "at least two inputs";
+  int oc = inputs[0].ic;
+  for (size_t i = 1; i < inputs.size(); ++i) {
+    CHECK_EQ(inputs[i].bs, inputs[0].bs);
+    CHECK_EQ(inputs[i].ih, inputs[0].ih);
+    CHECK_EQ(inputs[i].iw, inputs[0].iw);
+    oc += inputs[i].ic;
+  }
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("mkldnn_concat");
+  cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
+  cfg.layerConfig.set_active_type("relu");
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    std::stringstream ss;
+    ss << "layer_" << i;
+    cfg.inputDefs.push_back(
+        {INPUT_DATA,
+         ss.str(),
+         (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
+         0});
+    LayerInputConfig* input = cfg.layerConfig.add_inputs();
+    ImageConfig* img_conf = input->mutable_image_conf();
+    img_conf->set_channels(inputs[i].ic);
+    img_conf->set_img_size_y(inputs[i].ih);
+    img_conf->set_img_size(inputs[i].iw);
+  }
+}
+
+void testConcatLayer(const std::vector<testImageDesc>& inputs) {
+  TestConfig dnnConfig;
+  getMKLDNNConcatConfig(dnnConfig, inputs);
+  RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
+}
+
+TEST(MKLDNNLayer, ConcatLayer) {
+  testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});
+  testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});
+}
+
 void testActivation(std::string actType, const testImageDesc& pm) {
   // TODO(TJ): remove me when paddle support elu activation
   if (actType == "mkldnn_elu") {
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
deleted file mode 100644
index af6472619d1840e82787974d265d601b4a406c09..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ /dev/null
@@ -1,732 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <string>
-
-#include <gtest/gtest.h>
-
-#include "paddle/gserver/dataproviders/ProtoDataProvider.h"
-#include "paddle/utils/Util.h"
-
-#include "paddle/testing/TestUtil.h"
-
-using namespace std;  // NOLINT
-
-std::vector<string> protoFiles{
-    "./test_ProtoDataProvider/data1.bin", "./test_ProtoDataProvider/data2.bin",
-};
-std::vector<string> protoFilesCompressed{
-    "./test_ProtoDataProvider/data1.bin.gz",
-    "./test_ProtoDataProvider/data2.bin.gz",
-};
-
-const char* kTestDir = "./test_ProtoDataProvider";
-const char kProtoFileList[] = "gserver/tests/proto_files.txt";
-const char kProtoFileListCompressed[] =
-    "gserver/tests/proto_files_compressed.txt";
-const int kSpraseMatrixDim = 1024;
-
-using namespace paddle;  // NOLINT
-
-void prepareData(DataBatch* batch,
-                 const int* numPerSlotType,
-                 bool iid,
-                 bool useGpu) {
-  batch->clear();
-  int64_t size = uniformRandom(100) + 10;
-  batch->setSize(size);
-
-  ICpuGpuVectorPtr sequenceStartPositions;
-  ICpuGpuVectorPtr subSequenceStartPositions;
-  if (!iid) {
-    int numSeqs = uniformRandom(10) + 1;
-    sequenceStartPositions =
-        ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
-    int* buf = sequenceStartPositions->getMutableData(false);
-    subSequenceStartPositions =
-        ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
-    int* subBuf = subSequenceStartPositions->getMutableData(false);
-    int64_t pos = 0;
-    int maxLen = 2 * size / numSeqs;
-    for (int i = 0; i < numSeqs; ++i) {
-      int len =
-          uniformRandom(min<int64_t>(maxLen, size - pos - numSeqs + i)) + 1;
-      buf[i] = pos;
-      subBuf[i] = pos;
-      pos += len;
-      VLOG(1) << " len=" << len;
-    }
-    buf[numSeqs] = size;
-    subBuf[numSeqs] = size;
-  }
-
-  vector<Argument>& arguments = batch->getStreams();
-  for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_DENSE]; ++i) {
-    int64_t dim = rand() % 10 + 4;  // NOLINT rand_r
-    MatrixPtr mat = Matrix::create(size, dim, /* trans= */ false, false);
-    mat->randomizeUniform();
-    Argument arg;
-    arg.value = mat;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE]; ++i) {
-    MatrixPtr mat =
-        makeRandomSparseMatrix(size, kSpraseMatrixDim, false, useGpu);
-    Argument arg;
-    arg.value = mat;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arg.subSequenceStartPositions = subSequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE]; ++i) {
-    MatrixPtr mat =
-        makeRandomSparseMatrix(size, kSpraseMatrixDim, true, useGpu);
-    Argument arg;
-    arg.value = mat;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::STRING]; ++i) {
-    int64_t dim = rand() % 10 + 4;  // NOLINT rand_r
-    SVectorPtr vec = std::make_shared<std::vector<std::string>>();
-    for (int j = 0; j < size; ++j) {
-      vec->push_back(randStr(dim));
-    }
-    Argument arg;
-    arg.strs = vec;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::INDEX]; ++i) {
-    int64_t dim = rand() % 10 + 4;  // NOLINT rand_r
-    IVectorPtr vec = IVector::create(size, /* useGpu= */ false);
-    int* buf = vec->getData();
-    for (int j = 0; j < size; ++j) {
-      buf[j] = uniformRandom(dim);
-    }
-    Argument arg;
-    arg.ids = vec;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-}
-
-inline int getSlotDim(const Argument& arg) {
-  if (arg.value) {
-    return arg.value->getWidth();
-  } else if (arg.ids) {
-    return arg.ids->getMax() + 1;
-  } else if (arg.strs) {
-    return 1;
-  }
-  LOG(FATAL) << "Invalid argument";
-  return 0;
-}
-
-inline SlotDef::SlotType getSlotType(const Argument& arg) {
-  if (arg.value) {
-    auto& m = *arg.value;
-    auto& type = typeid(m);
-    if (type == typeid(CpuMatrix) || type == typeid(GpuMatrix)) {
-      return SlotDef::VECTOR_DENSE;
-    }
-    if (type == typeid(CpuSparseMatrix)) {
-      auto valueType =
-          std::dynamic_pointer_cast<CpuSparseMatrix>(arg.value)->getValueType();
-      if (NO_VALUE == valueType) {
-        return SlotDef::VECTOR_SPARSE_NON_VALUE;
-      } else {
-        return SlotDef::VECTOR_SPARSE_VALUE;
-      }
-    }
-    if (type == typeid(GpuSparseMatrix)) {
-      auto valueType =
-          std::dynamic_pointer_cast<GpuSparseMatrix>(arg.value)->getValueType();
-      if (NO_VALUE == valueType) {
-        return SlotDef::VECTOR_SPARSE_NON_VALUE;
-      } else {
-        return SlotDef::VECTOR_SPARSE_VALUE;
-      }
-    }
-
-    LOG(FATAL) << "Unknown matrix type";
-  }
-  if (arg.ids) return SlotDef::INDEX;
-  if (arg.strs) return SlotDef::STRING;
-  LOG(FATAL) << "Invalid argument";
-  return SlotDef::VECTOR_DENSE;
-}
-
-void getColRow(const Argument& arg,
-               int64_t pos,
-               bool useGpu,
-               int* colNum,
-               const int** rowCols,
-               const real** rowValues) {
-  SlotDef::SlotType type = getSlotType(arg);
-  GpuSparseMatrixPtr matGpu;
-  CpuSparseMatrixPtr matCpu;
-  if (useGpu) {
-    matGpu = dynamic_pointer_cast<GpuSparseMatrix>(arg.value);
-    ASSERT_TRUE(matGpu != NULL);
-  } else {
-    matCpu = dynamic_pointer_cast<CpuSparseMatrix>(arg.value);
-    ASSERT_TRUE(matCpu != NULL);
-  }
-  *colNum = useGpu ? matGpu->getColNum(pos) : matCpu->getColNum(pos);
-  *rowCols = useGpu ? matGpu->getRowCols(pos) : matCpu->getRowCols(pos);
-  if (type == SlotDef::VECTOR_SPARSE_VALUE) {
-    *rowValues = useGpu ? matGpu->getRowValues(pos) : matCpu->getRowValues(pos);
-  } else {
-    *rowValues = NULL;
-  }
-}
-
-void makeSample(const vector<Argument>& arguments,
-                int64_t pos,
-                bool isBeginning,
-                DataSample* sample,
-                bool useGpu) {
-  sample->set_is_beginning(isBeginning);
-  int slotid = 0;
-  for (auto& arg : arguments) {
-    SlotDef::SlotType type = getSlotType(arg);
-    int64_t dim = getSlotDim(arg);
-    switch (type) {
-      case SlotDef::VECTOR_DENSE: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        auto values = vecSlot->mutable_values();
-        values->Reserve(dim);
-        for (int i = 0; i < dim; ++i) {
-          values->AddAlreadyReserved(
-              static_cast<float>(arg.value->getElement(pos, i)));
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        sample->add_id_slots(arg.ids->get(pos));
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        auto ids = vecSlot->mutable_ids();
-        int colNum;
-        const int* rowCols;
-        const real* rowValues;  // nullptr
-        getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues);
-        ids->Reserve(colNum);
-        for (int i = 0; i < colNum; ++i) {
-          ids->AddAlreadyReserved(rowCols[i]);
-        }
-        SubseqSlot* subseqSlot = sample->add_subseq_slots();  // subseq
-        subseqSlot->set_slot_id(slotid);
-        auto lens = subseqSlot->mutable_lens();
-        lens->Add(colNum);
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        auto values = vecSlot->mutable_values();
-        auto ids = vecSlot->mutable_ids();
-        int colNum;
-        const int* rowCols;
-        const real* rowValues;
-        getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues);
-        ids->Reserve(colNum);
-        values->Reserve(colNum);
-        for (int i = 0; i < colNum; ++i) {
-          ids->AddAlreadyReserved(rowCols[i]);
-          values->AddAlreadyReserved(rowValues[i]);
-        }
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE:
-      case SlotDef::VAR_MDIM_INDEX: {
-        LOG(FATAL) << "Not implemented";
-        break;
-      }
-      case SlotDef::STRING: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        vecSlot->add_strs((*arg.strs)[pos]);
-        break;
-      }
-    }
-    slotid++;
-  }
-}
-
-void writeData(const DataBatch& batch, bool useGpu, bool dataCompression) {
-  DataHeader header;
-  const vector<Argument>& arguments = batch.getStreams();
-  for (auto& argument : arguments) {
-    SlotDef* slotDef = header.add_slot_defs();
-    slotDef->set_type(getSlotType(argument));
-    slotDef->set_dim(getSlotDim(argument));
-  }
-  VLOG(1) << "header=" << header.DebugString();
-
-  int64_t totalSeqs = batch.getNumSequences();
-  int64_t seq = 0;
-  ICpuGpuVectorPtr sequenceStartPositions = arguments[0].sequenceStartPositions;
-  int64_t numWritten = 0;
-  vector<string> curProtoFiles =
-      dataCompression ? protoFilesCompressed : protoFiles;
-  for (size_t i = 0; i < curProtoFiles.size(); ++i) {
-    int64_t numSeqs = totalSeqs * (i + 1) / curProtoFiles.size() -
-                      totalSeqs * i / curProtoFiles.size();
-    ofstream os(curProtoFiles[i]);
-    CHECK(os) << "Fail to open " << curProtoFiles[i];
-    unique_ptr<ProtoWriter> writer(new ProtoWriter(&os, dataCompression));
-    CHECK(writer->write(header));
-    for (int j = 0; j < numSeqs; ++j, ++seq) {
-      int64_t begin = seq;
-      int64_t end = seq + 1;
-      if (sequenceStartPositions) {
-        begin = sequenceStartPositions->getElement(seq);
-        end = sequenceStartPositions->getElement(seq + 1);
-      }
-      for (int pos = begin; pos < end; ++pos) {
-        DataSample sample;
-        makeSample(arguments, pos, pos == begin, &sample, useGpu);
-        CHECK(writer->write(sample));
-        ++numWritten;
-      }
-    }
-
-    writer.reset(nullptr);
-    os.close();
-  }
-  CHECK_EQ(arguments[0].getBatchSize(), numWritten);
-}
-
-// check that the sample at pos1 in args1 is same as the sample at pos2 in args2
-void checkSample(const vector<Argument>& args1,
-                 int64_t pos1,
-                 const vector<Argument>& args2,
-                 int64_t pos2,
-                 bool useGpu) {
-  EXPECT_EQ(args1.size(), args2.size());
-  VLOG(1) << " pos1=" << pos1 << " pos2=" << pos2;
-
-  for (size_t i = 0; i < args1.size(); ++i) {
-    auto type = getSlotType(args1[i]);
-    int dim = getSlotDim(args1[i]);
-    EXPECT_EQ(type, getSlotType(args2[i]));
-    if (type == SlotDef::INDEX) {
-      EXPECT_GE(dim, getSlotDim(args2[i]));
-    } else {
-      EXPECT_EQ(dim, getSlotDim(args2[i]));
-    }
-    switch (type) {
-      case SlotDef::VECTOR_DENSE: {
-        for (int j = 0; j < dim; ++j) {
-          EXPECT_EQ(static_cast<float>(args1[i].value->getElement(pos1, j)),
-                    static_cast<float>(args2[i].value->getElement(pos2, j)));
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        EXPECT_EQ(args1[i].ids->get(pos1), args2[i].ids->get(pos2));
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE:
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        int colNum1, colNum2;
-        const int *rowCols1, *rowCols2;
-        const real *rowValues1, *rowValues2;
-        getColRow(args1[i], pos1, useGpu, &colNum1, &rowCols1, &rowValues1);
-        getColRow(args2[i], pos2, useGpu, &colNum2, &rowCols2, &rowValues2);
-        EXPECT_EQ(colNum1, colNum2);
-        for (int j = 0; j < colNum1; ++j) {
-          EXPECT_EQ(rowCols1[j], rowCols2[j]);
-          if (type == SlotDef::VECTOR_SPARSE_VALUE) {
-            EXPECT_EQ(rowValues1[j], rowValues2[j]);
-          }
-        }
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE:
-      case SlotDef::VAR_MDIM_INDEX: {
-        LOG(FATAL) << "Not implemented";
-        break;
-      }
-      case SlotDef::STRING: {
-        EXPECT_EQ((*args1[i].strs)[pos1], (*args2[i].strs)[pos2]);
-        break;
-      }
-    }
-  }
-}
-
-void testProtoDataProvider(int* numPerSlotType,
-                           bool iid,
-                           bool async,
-                           bool useGpu,
-                           bool dataCompression,
-                           int numConstantSlots = 0) {
-  mkDir(kTestDir);
-  DataBatch data;
-
-  prepareData(&data, numPerSlotType, iid, useGpu);
-  writeData(data, useGpu, dataCompression);
-
-  DataConfig config;
-  config.set_type("proto");
-  config.set_files(dataCompression ? kProtoFileListCompressed : kProtoFileList);
-  config.set_async_load_data(async);
-
-  for (int i = 0; i < numConstantSlots; ++i) {
-    config.add_constant_slots(i + 11);
-    MatrixPtr w = Matrix::create(data.getSize(),
-                                 1,
-                                 /* trans= */ false,
-                                 /* useGpu= */ false);
-    w->assign(config.constant_slots(i));
-    data.appendData(w);
-  }
-
-  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
-  dataProvider->setSkipShuffle();
-
-  EXPECT_EQ(data.getSize(), dataProvider->getSize());
-
-  int64_t batchSize = 10;
-  DataBatch batch;
-
-  size_t seq1 = 0;
-  vector<Argument>& args1 = data.getStreams();
-  ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
-
-  dataProvider->reset();
-
-  while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
-    CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
-    vector<Argument>& args2 = batch.getStreams();
-    ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
-    for (auto& arg : args2) {
-      EXPECT_EQ(iid, !arg.sequenceStartPositions);
-    }
-    size_t numSeqs = batch.getNumSequences();
-    VLOG(1) << "numSeqs=" << numSeqs;
-    for (size_t seq2 = 0; seq2 < numSeqs; ++seq1, ++seq2) {
-      int64_t begin1 = seq1;
-      int64_t end1 = seq1 + 1;
-      if (sequenceStartPositions1) {
-        begin1 = sequenceStartPositions1->getElement(seq1);
-        end1 = sequenceStartPositions1->getElement(seq1 + 1);
-        EXPECT_LT(seq1, sequenceStartPositions1->getSize() - 1);
-      }
-
-      int64_t begin2 = seq2;
-      int64_t end2 = seq2 + 1;
-      if (sequenceStartPositions2) {
-        begin2 = sequenceStartPositions2->getElement(seq2);
-        end2 = sequenceStartPositions2->getElement(seq2 + 1);
-      }
-      VLOG(1) << " begin1=" << begin1 << " end1=" << end1
-              << " begin2=" << begin2 << " end2=" << end2;
-      EXPECT_EQ(end1 - begin1, end2 - begin2);
-      for (int i = 0; i < end1 - begin1; ++i) {
-        checkSample(args1, begin1 + i, args2, begin2 + i, useGpu);
-      }
-    }
-  }
-
-  EXPECT_EQ(seq1, (size_t)data.getNumSequences());
-  rmDir(kTestDir);
-}
-
-TEST(ProtoDataProvider, test) {
-  int numSlotsArray[] = {0, 3};
-  int numTwoArray[] = {0, 1};
-  int numSlotsArraySize = sizeof(numSlotsArray) / sizeof(numSlotsArray[0]);
-  const int numSlot = 5;
-  int combination[numSlot] = {0};
-  int k = numSlot - 1;
-  while (k >= 0) {
-    int numDenseVecSlots = numSlotsArray[combination[0]];
-    int numSparseNonValueVecSlots = numSlotsArray[combination[1]];
-    int numSparseValueVectorSlots = numSlotsArray[combination[2]];
-    int numStrSlots = numSlotsArray[combination[3]];
-    int numIdSlots = numSlotsArray[combination[4]];
-    // while loop : traverse all cases
-    k = numSlot - 1;
-    while (k >= 0) {
-      if (combination[k] < (numSlotsArraySize - 1)) {
-        ++combination[k];
-        break;
-      } else {
-        combination[k] = 0;
-        --k;
-      }
-    }
-    if (numDenseVecSlots + numSparseNonValueVecSlots +
-            numSparseValueVectorSlots + numStrSlots + numIdSlots <
-        1)
-      continue;
-    for (int iid : numTwoArray) {
-      for (int async : numTwoArray) {
-        for (int useGpu : numTwoArray) {
-          for (int dataCompression : numTwoArray) {
-            if (async && useGpu) {
-              // Currently in async mode, useGpu is not supported
-              continue;
-            }
-#ifndef PADDLE_WITH_CUDA
-            if (useGpu) {
-              continue;
-            }
-#endif
-            LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
-                      << " numSparseNonValueVecSlots="
-                      << numSparseNonValueVecSlots
-                      << " numSparseValueVectorSlots="
-                      << numSparseValueVectorSlots
-                      << " numStrSlots=" << numStrSlots
-                      << " numIdSlots=" << numIdSlots << " iid=" << iid
-                      << " async=" << async << " useGpu=" << useGpu
-                      << " dataCompression=" << dataCompression;
-            int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
-            numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
-                numSparseNonValueVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] =
-                numSparseValueVectorSlots;
-            numPerSlotType[SlotDef::INDEX] = numIdSlots;
-            numPerSlotType[SlotDef::STRING] = numStrSlots;
-            testProtoDataProvider(
-                numPerSlotType, iid, async, useGpu, dataCompression);
-          }  // end for (int dataCompression : numTwoArray)
-        }    // end for (int useGpu : numTwoArray)
-      }      // end for (int async : numTwoArray)
-    }        // end for (int iid : numTwoArray)
-  }          // end for (while, traverse all slots)
-}
-
-TEST(ProtoDataProvider, constant_slots) {
-  int numSlotsArray[] = {0, 3};
-  int numTwoArray[] = {0, 1};
-  for (int numDenseVecSlots : numSlotsArray) {
-    for (int numSparseNonValueVecSlots : numSlotsArray) {
-      if (numDenseVecSlots + numSparseNonValueVecSlots < 1) continue;
-      for (int numConstantSlots : {1, 2}) {
-        for (int useGpu : numTwoArray) {
-          for (int dataCompression : numTwoArray) {
-#ifndef PADDLE_WITH_CUDA
-            if (useGpu) {
-              continue;
-            }
-#endif
-            LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
-                      << " numSparseNonValueVecSlots="
-                      << numSparseNonValueVecSlots
-                      << " numConstantSlogs=" << numConstantSlots
-                      << " useGpu=" << useGpu
-                      << " dataCompression=" << dataCompression;
-            int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
-            numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
-                numSparseNonValueVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] = 1;
-            numPerSlotType[SlotDef::INDEX] = 1;
-            testProtoDataProvider(numPerSlotType,
-                                  /* iid= */ true,
-                                  /* async= */ false,
-                                  useGpu,
-                                  dataCompression,
-                                  numConstantSlots);
-          }  // end for (int dataCompression : numTwoArray)
-        }    // end for (int useGpu : numTwoArray)
-      }      // end for (int numConstantSlots : {1, 2})
-    }        // end for (int numSparseNonValueVecSlots : numSlotsArray)
-  }          // end for (int numDenseVecSlots : numSlotsArray)
-}
-
-void checkSampleSequence(const vector<Argument>& args1,
-                         const vector<Argument>& args2,
-                         int64_t offset,
-                         int64_t numSeqs,
-                         bool useGpu) {
-  // check slot num are equal
-  EXPECT_EQ(args1.size(), args2.size());
-  for (size_t i = 0; i < args1.size(); i++) {
-    auto type = getSlotType(args1[i]);
-    // check for args2: sequenceStartPositions vs numSeqs
-    // (1) size
-    EXPECT_EQ(args2[i].sequenceStartPositions->getSize(), (size_t)numSeqs + 1);
-    // (2) content
-    auto checkArgContent = [&](const Argument& args, int numSeqs) {
-      for (int j = 0; j <= numSeqs; j++) {
-        int start_pos = args.sequenceStartPositions->getElement(j);
-        EXPECT_EQ(start_pos, j);
-      }
-    };
-    switch (type) {
-      case SlotDef::INDEX: {
-        // args1: for label
-        checkArgContent(args2[i], numSeqs);
-        // check for args2: ids are equal to args1[offset]
-        // (1) size
-        EXPECT_EQ(args2[i].ids->getSize(), (size_t)numSeqs);
-        // (2) content
-        for (int j = 0; j < numSeqs; j++) {
-          EXPECT_EQ(args2[i].ids->get(j), args1[i].ids->get(offset + j));
-        }
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        // args1: for sparse_non_value
-        // args2 should put sparse indexes in ids
-        int colNum1;
-        const int* rowCols1;
-        const real* rowValues1;  // nullptr
-        int totalLength = 0;
-        for (int j = 0; j < numSeqs; j++) {
-          getColRow(
-              args1[i], offset + j, useGpu, &colNum1, &rowCols1, &rowValues1);
-          // (1) lengths
-          EXPECT_EQ(totalLength,
-                    args2[i].sequenceStartPositions->getElement(j));
-          EXPECT_EQ(totalLength,
-                    args2[i].subSequenceStartPositions->getElement(j));
-          // (2) content
-          for (int k = 0; k < colNum1; k++) {
-            EXPECT_EQ(rowCols1[k], args2[i].ids->get(totalLength + k));
-          }
-          totalLength += colNum1;
-          if (colNum1 == 0) {
-            // special case here: we will put a "-1" into ids when column num is
-            // zero. see ProtoSequenceDataProvider::getNextBatchInternal.
-            EXPECT_EQ(-1, args2[i].ids->get(totalLength));
-            totalLength++;
-          }
-        }
-        EXPECT_EQ(totalLength,
-                  args2[i].sequenceStartPositions->getElement(numSeqs));
-        EXPECT_EQ(totalLength,
-                  args2[i].subSequenceStartPositions->getElement(numSeqs));
-        break;
-      }
-      case SlotDef::VECTOR_DENSE: {
-        // args1: for dense vector
-        checkArgContent(args2[i], numSeqs);
-        // check for args2: values are equal to args1[offset]
-        // (1) size
-        EXPECT_EQ(args2[i].value->getHeight(), (size_t)numSeqs);
-        EXPECT_EQ(args2[i].value->getWidth(), (size_t)getSlotDim(args1[i]));
-        // (2) content
-        for (int j = 0; j < numSeqs; j++) {
-          for (size_t k = 0; k < args2[i].value->getWidth(); k++) {
-            EXPECT_EQ(
-                static_cast<float>(args1[i].value->getElement(j + offset, k)),
-                static_cast<float>(args2[i].value->getElement(j, k)));
-          }
-        }
-        break;
-      }
-      default: { EXPECT_EQ(true, false) << "should not reach here"; }
-    }
-  }
-}
-
-void testProtoSequenceDataProvider(int* numPerSlotType,
-                                   bool async,
-                                   bool useGpu) {
-  mkDir(kTestDir);
-  DataBatch data;
-
-  prepareData(&data,
-              numPerSlotType,
-              /* iid */ true,
-              useGpu);
-  writeData(data, useGpu, /* dataCompression */ false);
-
-  DataConfig config;
-  config.set_type("proto_sequence");
-  config.set_files(kProtoFileList);
-  config.set_async_load_data(async);
-
-  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
-  dataProvider->setSkipShuffle();
-
-  EXPECT_EQ(data.getSize(), dataProvider->getSize());
-
-  int64_t batchSize = 10;
-  DataBatch batch;
-
-  vector<Argument>& args1 = data.getStreams();
-  ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
-
-  dataProvider->reset();
-
-  size_t args1Offset = 0;
-  while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
-    CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
-    vector<Argument>& args2 = batch.getStreams();
-    ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
-    for (auto& arg : args1) {
-      // args1 should not has sequence
-      EXPECT_EQ(true, !arg.sequenceStartPositions);
-    }
-    for (auto& arg : args2) {
-      // args2 should has sequence
-      EXPECT_NE(true, !arg.sequenceStartPositions);
-    }
-    size_t numSeqs = batch.getNumSequences();
-    checkSampleSequence(args1, args2, args1Offset, numSeqs, useGpu);
-    args1Offset += numSeqs;
-  }
-
-  EXPECT_EQ(args1Offset, (size_t)data.getNumSequences());
-  rmDir(kTestDir);
-}
-
-TEST(ProtoSequenceDataProvider, test) {
-  int numSlotsArray[] = {0, 3};
-  int numTwoArray[] = {0, 1};
-  for (int numSparseNonValueVecSlots : numSlotsArray) {
-    for (int numIdSlots : numSlotsArray) {
-      for (int numDenseVecSlots : numSlotsArray) {
-        if (numDenseVecSlots + numSparseNonValueVecSlots + numIdSlots < 1)
-          continue;
-        for (int async : numTwoArray) {
-          for (int useGpu : numTwoArray) {
-            if (async && useGpu) {
-              // Currently in async mode, useGpu is not supported
-              continue;
-            }
-#ifndef PADDLE_WITH_CUDA
-            if (useGpu) {
-              continue;
-            }
-#endif
-            LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
-                      << " numSparseNonValueVecSlots="
-                      << numSparseNonValueVecSlots
-                      << " numIdSlots=" << numIdSlots << " async=" << async
-                      << " useGpu=" << useGpu;
-            int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
-            numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
-                numSparseNonValueVecSlots;
-            numPerSlotType[SlotDef::INDEX] = numIdSlots;
-            testProtoSequenceDataProvider(numPerSlotType, async, useGpu);
-          }  // end for (int useGpu : numTwoArray)
-        }    // end for (int async : numTwoArray)
-      }      // end for (int numDenseVecSlots : numSlotsArray)
-    }        // end for (int numIdSlots : numSlotsArray)
-  }          // end for (int numSparseNonValueVecSlots : numSlotsArray)
-}
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index bf62229c03bb1d6e2bdf86d8c56a8157938fb832..dc6979cf5a5229fb09866189f28217889d58c2d0 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -260,6 +260,35 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
   os << ";";
 }
 
+void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
+  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
+  CHECK_EQ(height_, b.getHeight());
+  CHECK_EQ(width_, b.getWidth());
+  real* A = getValue();
+  real* B = b.getValue();
+  if (b.getValueType() == FLOAT_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = B[j] * c.getElement(i, cCol);
+      }
+    }
+  } else if (b.getValueType() == NO_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = c.getElement(i, cCol);
+      }
+    }
+  }
+}
+
 void CpuSparseMatrix::randomizeUniform() {
   CHECK_LE(elementCnt_, height_ * width_);
   if (valueType_ == FLOAT_VALUE) {
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index aad1348353d558abca72ed0fa5cf943237e3ac78..522b436a2a69179d3f4f17c919d5ba024102db7b 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -239,6 +239,15 @@ public:
               const unsigned int* cols,
               const real* values);
 
+  /**
+   * @brief this_row = b_row * c_row[cCol]
+   *
+   * @param[in]  cCol   the column of matrix c used to scale each row of b
+   * @param[in]  b      CpuSparseMatrix
+   * @param[in]  c      Matrix
+   */
+  void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c);
+
   void randomizeUniform();
 
   void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream);
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index aed5275dbf9be707cc6e19e729133ba8eab58195..8841c14ee083fccfd2271efd0c331805919a09d9 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_subdirectory(detail)
 
-cc_library(memory SRCS memory.cc DEPS place)
+cc_library(memory SRCS memory.cc DEPS place enforce)
 cc_library(memcpy SRCS memcpy.cc)
 
 cc_library(paddle_memory
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index a719da2560291dbc7e98aadfae41d4692d8afcad..937441b318095eadb9022c1d7578ad8aca2dadc8 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -61,6 +61,25 @@ function(op_library TARGET)
         set(pybind_flag 1)
     endif()
 
+    if ("${TARGET}" STREQUAL "compare_op")
+        set(pybind_flag 1)
+        file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
+    endif()
+
+    # conv_op contains several operators
+    if ("${TARGET}" STREQUAL "conv_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
+    endif()
+
+    # conv_cudnn_op contains several operators
+    if ("${TARGET}" STREQUAL "conv_cudnn_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(conv2d_cudnn);\n")
+    endif()
+
     # pool_op contains several operators
     if ("${TARGET}" STREQUAL "pool_op")
         set(pybind_flag 1)
@@ -68,23 +87,23 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
     endif()
 
-    if ("${TARGET}" STREQUAL "compare_op")
+    # pool_cudnn_op contains several operators
+    if ("${TARGET}" STREQUAL "pool_cudnn_op")
         set(pybind_flag 1)
-        file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
     endif()
 
-    # pool_with_index_op contains several operators
-    if ("${TARGET}" STREQUAL "pool_with_index_op")
+    if ("${TARGET}" STREQUAL "logical_op")
         set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
+        file(APPEND ${pybind_file} "USE_OP(logical_and);\n")
     endif()
 
-    # conv_op contains several operators
-    if ("${TARGET}" STREQUAL "conv_op")
+    # pool_with_index_op contains several operators
+    if ("${TARGET}" STREQUAL "pool_with_index_op")
         set(pybind_flag 1)
         # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
+        file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
     endif()
 
     # conv_transpose_op contains several operators
@@ -93,12 +112,12 @@ function(op_library TARGET)
         # It's enough to just adding one operator to pybind
         file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
     endif()
-    
-    # pool_cudnn_op contains several operators
-    if ("${TARGET}" STREQUAL "pool_cudnn_op")
+
+    # conv_transpose_cudnn_op contains two operators
+    if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op")
         set(pybind_flag 1)
         # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
+        file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n")
     endif()
 
     # save_restore_op contains several operators
@@ -166,12 +185,13 @@ set(DEPS_OPS
     cond_op
     cross_entropy_op
     recurrent_op
-    dynamic_recurrent_op
     softmax_with_cross_entropy_op
     softmax_op
     sequence_softmax_op
     sum_op
     pool_op
+    maxout_op
+    unpool_op
     pool_with_index_op
     conv_op
     conv_transpose_op
@@ -181,12 +201,29 @@ set(DEPS_OPS
     lod_rank_table_op
     lod_tensor_to_array_op
     array_to_lod_tensor_op
+    max_sequence_len_op
     lstm_op
     tensor_array_read_write_op
     gru_op
     adagrad_op
-    sgd_op)
+    sgd_op
+    save_op
+    load_op
+    send_op
+    recv_op)
 
+add_subdirectory(detail)
+op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+set_source_files_properties(
+    send_op.cc
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+
+op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+set_source_files_properties(
+    recv_op.cc
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
 
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
@@ -198,10 +235,13 @@ op_library(sgd_op DEPS selected_rows_functor)
 op_library(adagrad_op DEPS selected_rows_functor)
 op_library(conv_op DEPS vol2col)
 op_library(pool_op DEPS pooling)
+op_library(maxout_op DEPS maxouting)
+op_library(unpool_op DEPS unpooling)
 op_library(pool_with_index_op DEPS pooling)
 op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
 op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
 op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op)
+op_library(max_sequence_len_op SRCS max_sequence_len_op.cc DEPS lod_rank_table)
 op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
 if(WITH_GPU)
 op_library(nccl_op DEPS nccl_common)
@@ -211,15 +251,12 @@ op_library(sequence_pool_op DEPS sequence_pooling)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
 op_library(conv_transpose_op DEPS vol2col)
 op_library(gru_op DEPS sequence2batch gru_compute)
-if(WITH_TESTING)
-    op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
-        DEPS net_op tensor_array gtest)
-else()
-    op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
-            DEPS net_op tensor_array)
-endif()
 op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
 
+# FIXME(typhoonzero): save/load depends lodtensor serialization functions
+op_library(save_op DEPS lod_tensor)
+op_library(load_op DEPS lod_tensor)
+
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
     op_library(${src})
@@ -227,15 +264,15 @@ endforeach()
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
+
+
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
-cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc
-        rnn/recurrent_op_utils.cc
-        DEPS dynamic_recurrent_op)
 if(WITH_GPU)
   cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
+cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
index 83d35a450d0e8ebf5311cdfd948b066642ccec8c..154c618e8e7c4650b7f22684d3357de9c52a416c 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -98,7 +98,6 @@ $y = \max(x, 0)$
   }
 };
 
-template <typename AttrType>
 class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   LeakyReluOpMaker(framework::OpProto *proto,
@@ -106,8 +105,7 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of LeakyRelu operator");
     AddOutput("Y", "Output of LeakyRelu operator");
-    AddAttr<AttrType>("alpha", "The small negative slope")
-        .SetDefault(static_cast<AttrType>(0.02f));
+    AddAttr<float>("alpha", "The small negative slope").SetDefault(0.02f);
     AddComment(R"DOC(
 LeakyRelu Activation Operator.
 
@@ -117,7 +115,6 @@ $y = \max(x, \alpha * x)$
   }
 };
 
-template <typename AttrType>
 class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   SoftShrinkOpMaker(framework::OpProto *proto,
@@ -125,8 +122,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Softshrink operator");
     AddOutput("Y", "Output of Softshrink operator");
-    AddAttr<AttrType>("lambda", "non-negative offset")
-        .SetDefault(static_cast<AttrType>(0.5f));
+    AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
     AddComment(R"DOC(
 Softshrink Activation Operator.
 
@@ -173,7 +169,6 @@ $$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
   }
 };
 
-template <typename AttrType>
 class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   HardShrinkOpMaker(framework::OpProto *proto,
@@ -181,8 +176,8 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of HardShrink operator");
     AddOutput("Y", "Output of HardShrink operator");
-    AddAttr<AttrType>("threshold", "The value of threshold for HardShrink")
-        .SetDefault(static_cast<AttrType>(0.5));
+    AddAttr<float>("threshold", "The value of threshold for HardShrink")
+        .SetDefault(0.5f);
     AddComment(R"DOC(
 HardShrink Activation Operator.
 
@@ -228,6 +223,51 @@ $y = |x|$
   }
 };
 
+class CeilOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CeilOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Ceil operator");
+    AddOutput("Y", "Output of Ceil operator");
+    AddComment(R"DOC(
+Ceil Activation Operator.
+
+$y = ceil(x)$
+
+)DOC");
+  }
+};
+
+class FloorOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FloorOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Floor operator");
+    AddOutput("Y", "Output of Floor operator");
+    AddComment(R"DOC(
+Floor Activation Operator.
+
+$y = floor(x)$
+
+)DOC");
+  }
+};
+
+class RoundOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RoundOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Round operator");
+    AddOutput("Y", "Output of Round operator");
+    AddComment(R"DOC(
+Round Activation Operator.
+
+$y = [x]$
+
+)DOC");
+  }
+};
+
 class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ReciprocalOpMaker(framework::OpProto *proto,
@@ -308,17 +348,16 @@ $$y = \frac{x}{1 + |x|}$$
   }
 };
 
-template <typename AttrType>
 class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   BReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of BRelu operator");
     AddOutput("Y", "Output of BRelu operator");
-    AddAttr<AttrType>("t_min", "The min marginal value of BRelu")
-        .SetDefault(static_cast<AttrType>(0));
-    AddAttr<AttrType>("t_max", "The max marginal value of BRelu")
-        .SetDefault(static_cast<AttrType>(24));
+    AddAttr<float>("t_min", "The min marginal value of BRelu")
+        .SetDefault(static_cast<float>(0));
+    AddAttr<float>("t_max", "The max marginal value of BRelu")
+        .SetDefault(static_cast<float>(24));
     AddComment(R"DOC(
 BRelu Activation Operator.
 
@@ -328,7 +367,6 @@ $y = \max(\min(x, t_{min}), t_{max})$
   }
 };
 
-template <typename AttrType>
 class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   SoftReluOpMaker(framework::OpProto *proto,
@@ -336,8 +374,8 @@ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of SoftRelu operator");
     AddOutput("Y", "Output of SoftRelu operator");
-    AddAttr<AttrType>("threshold", "The threshold value of SoftRelu")
-        .SetDefault(static_cast<AttrType>(40));
+    AddAttr<float>("threshold", "The threshold value of SoftRelu")
+        .SetDefault(40.0f);
     AddComment(R"DOC(
 SoftRelu Activation Operator.
 
@@ -347,15 +385,13 @@ $y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$
   }
 };
 
-template <typename AttrType>
 class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ELUOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of ELU operator");
     AddOutput("Y", "Output of ELU operator");
-    AddAttr<AttrType>("alpha", "The alpha value of ELU")
-        .SetDefault(static_cast<AttrType>(1.0f));
+    AddAttr<float>("alpha", "The alpha value of ELU").SetDefault(1.0f);
     AddComment(R"DOC(
 ELU Activation Operator.
 
@@ -368,15 +404,14 @@ $y = \max(0, x) + \min(0, \alpha * (e^x - 1))$
   }
 };
 
-template <typename AttrType>
 class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   Relu6OpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Relu6 operator");
     AddOutput("Y", "Output of Relu6 operator");
-    AddAttr<AttrType>("threshold", "The threshold value of Relu6")
-        .SetDefault(static_cast<AttrType>(6));
+    AddAttr<float>("threshold", "The threshold value of Relu6")
+        .SetDefault(6.0f);
     AddComment(R"DOC(
 Relu6 Activation Operator.
 
@@ -386,15 +421,13 @@ $y = \min(\max(0, x), 6)$
   }
 };
 
-template <typename AttrType>
 class PowOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   PowOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Pow operator");
     AddOutput("Y", "Output of Pow operator");
-    AddAttr<AttrType>("factor", "The exponential factor of Pow")
-        .SetDefault(static_cast<AttrType>(1));
+    AddAttr<float>("factor", "The exponential factor of Pow").SetDefault(1.0f);
     AddComment(R"DOC(
 Pow Activation Operator.
 
@@ -404,17 +437,16 @@ $y = x^{factor}$
   }
 };
 
-template <typename AttrType>
 class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   STanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of STanh operator");
     AddOutput("Y", "Output of STanh operator");
-    AddAttr<AttrType>("scale_a", "The scale parameter of a for the input")
-        .SetDefault(static_cast<AttrType>(2 / 3));
-    AddAttr<AttrType>("scale_b", "The scale parameter of b for the input")
-        .SetDefault(static_cast<AttrType>(1.7159));
+    AddAttr<float>("scale_a", "The scale parameter of a for the input")
+        .SetDefault(2.0f / 3.0f);
+    AddAttr<float>("scale_b", "The scale parameter of b for the input")
+        .SetDefault(1.7159f);
     AddComment(R"DOC(
 STanh Activation Operator.
 
@@ -424,7 +456,6 @@ $$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
   }
 };
 
-template <typename AttrType>
 class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ThresholdedReluOpMaker(framework::OpProto *proto,
@@ -432,8 +463,8 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of ThresholdedRelu operator");
     AddOutput("Y", "Output of ThresholdedRelu operator");
-    AddAttr<AttrType>("threshold", "The threshold location of activation")
-        .SetDefault(static_cast<AttrType>(1.0));
+    AddAttr<float>("threshold", "The threshold location of activation")
+        .SetDefault(1.0f);
     AddComment(R"DOC(
 ThresholdedRelu Activation Operator.
 
@@ -448,7 +479,6 @@ $$
   }
 };
 
-template <typename AttrType>
 class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   HardSigmoidOpMaker(framework::OpProto *proto,
@@ -456,10 +486,10 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of HardSigmoid operator");
     AddOutput("Y", "Output of HardSigmoid operator");
-    AddAttr<AttrType>("slope", "Slope for linear approximation of sigmoid")
-        .SetDefault(static_cast<AttrType>(0.2));
-    AddAttr<AttrType>("offset", "Offset for linear approximation of sigmoid")
-        .SetDefault(static_cast<AttrType>(0.5));
+    AddAttr<float>("slope", "Slope for linear approximation of sigmoid")
+        .SetDefault(0.2f);
+    AddAttr<float>("offset", "Offset for linear approximation of sigmoid")
+        .SetDefault(0.5f);
     AddComment(R"DOC(
 HardSigmoid Activation Operator.
 
@@ -499,7 +529,7 @@ REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
 REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
             tanh_shrink_grad, ops::ActivationOpGrad);
 
-REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker<float>,
+REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
             softshrink_grad, ops::ActivationOpGrad);
 
 REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
@@ -508,6 +538,15 @@ REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
 REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad,
             ops::ActivationOpGrad);
 
+REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad,
+            ops::ActivationOpGrad);
+
 REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker,
             reciprocal_grad, ops::ActivationOpGrad);
 
@@ -523,35 +562,34 @@ REGISTER_OP(softplus, ops::ActivationOp, ops::SoftplusOpMaker, softplus_grad,
 REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad,
             ops::ActivationOpGrad);
 
-REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker<float>, brelu_grad,
+REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad,
             ops::ActivationOpGrad);
 
-REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker<float>,
+REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker,
             leaky_relu_grad, ops::ActivationOpGrad);
 
-REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker<float>,
-            soft_relu_grad, ops::ActivationOpGrad);
+REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, soft_relu_grad,
+            ops::ActivationOpGrad);
 
-REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker<float>, elu_grad,
+REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker, elu_grad,
             ops::ActivationOpGrad);
 
-REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker<float>, relu6_grad,
+REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker, relu6_grad,
             ops::ActivationOpGrad);
 
-REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker<float>, pow_grad,
+REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad,
             ops::ActivationOpGrad);
 
-REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker<float>, stanh_grad,
+REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad,
             ops::ActivationOpGrad);
 
-REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker<float>,
+REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker,
             hard_shrink_grad, ops::ActivationOpGrad);
 
-REGISTER_OP(thresholded_relu, ops::ActivationOp,
-            ops::ThresholdedReluOpMaker<float>, thresholded_relu_grad,
-            ops::ActivationOpGrad);
+REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker,
+            thresholded_relu_grad, ops::ActivationOpGrad);
 
-REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker<float>,
+REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker,
             hard_sigmoid_grad, ops::ActivationOpGrad);
 
 #define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)       \
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index ceb4b4e40b67473f42e67e3f02f8e012e1b1eb50..8cd3bfbbd3f8f3210f94aef3a1586c8295730c1d 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -283,6 +283,41 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+// ceil(x) = ceiling(x)
+template <typename T>
+struct CeilFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.ceil();
+  }
+};
+
+template <typename T>
+struct ZeroGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    dx.device(d) = static_cast<T>(0) / x;
+  }
+};
+
+// floor(x) = flooring(x)
+template <typename T>
+struct FloorFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.ceil();
+  }
+};
+
+// round(x) = [x]
+template <typename T>
+struct RoundFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.round();
+  }
+};
+
 // abs(x) = |x|
 template <typename T>
 struct AbsFunctor : public BaseActivationFunctor<T> {
@@ -677,6 +712,9 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
   __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);     \
   __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                       \
   __macro(abs, AbsFunctor, AbsGradFunctor);                          \
+  __macro(ceil, CeilFunctor, ZeroGradFunctor);                       \
+  __macro(floor, FloorFunctor, ZeroGradFunctor);                     \
+  __macro(round, RoundFunctor, ZeroGradFunctor);                     \
   __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);     \
   __macro(log, LogFunctor, LogGradFunctor);                          \
   __macro(square, SquareFunctor, SquareGradFunctor);                 \
diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc
index b717e1647e4b89285b841420650dc69e8a1e0c58..16a7794d5b7bf1d56cd9f5874454c41cab43b41f 100644
--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
@@ -109,4 +109,5 @@ paramOut = param + paramUpdate$$
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>);
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>,
+    ops::AdadeltaOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/adadelta_op.cu b/paddle/operators/adadelta_op.cu
index 3af1c8c8e9861138a33b3156818f704c3b20363f..9fb61852071f11670b8bc51321bb0881de196777 100644
--- a/paddle/operators/adadelta_op.cu
+++ b/paddle/operators/adadelta_op.cu
@@ -17,4 +17,5 @@
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>);
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>,
+    ops::AdadeltaOpKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/adadelta_op.h b/paddle/operators/adadelta_op.h
index d29e15c43583bd447fbacb548a326f303f7d1463..a8c5f0c8aa20ce506f5279fa696079ba64034bd5 100644
--- a/paddle/operators/adadelta_op.h
+++ b/paddle/operators/adadelta_op.h
@@ -33,8 +33,8 @@ class AdadeltaOpKernel : public framework::OpKernel<T> {
     avg_squared_grad_out_tensor->mutable_data<T>(ctx.GetPlace());
     avg_squared_update_out_tensor->mutable_data<T>(ctx.GetPlace());
 
-    float rho = ctx.Attr<float>("rho");
-    float epsilon = ctx.Attr<float>("epsilon");
+    T rho = static_cast<T>(ctx.Attr<float>("rho"));
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
 
     auto param = framework::EigenVector<T>::Flatten(
         *ctx.Input<framework::Tensor>("Param"));
diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu
index 5b869e6bc5f4604ba6055ffd62fa21e4a1f41b93..1c870214b29dbfcabb7414317b1214d6bef369cb 100644
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -14,8 +14,8 @@
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/adagrad_op.h"
-#include "paddle/operators/math/selected_rows_functor.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
 #include "paddle/platform/cuda_helper.h"
 
 namespace paddle {
@@ -134,8 +134,8 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
         T, 256><<<grid2, threads, 0,
                   reinterpret_cast<const platform::CUDADeviceContext&>(context)
                       .stream()>>>(grad_merge_data, grad_merge->rows().data(),
-                                   lr, param_data,
-                                   moment_data, grad_width, epsilon);
+                                   lr, param_data, moment_data, grad_width,
+                                   epsilon);
   }
 };
 
diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc
index 97a091ae766abfba5412bbd32c34a6f80701fbf7..03faa2a7c5a486cb0d2b6f2f10d140eeb4c6c04e 100644
--- a/paddle/operators/adam_op.cc
+++ b/paddle/operators/adam_op.cc
@@ -127,4 +127,5 @@ paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker);
 REGISTER_OP_CPU_KERNEL(adam,
-                       ops::AdamOpKernel<paddle::platform::CPUPlace, float>);
+                       ops::AdamOpKernel<paddle::platform::CPUPlace, float>,
+                       ops::AdamOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/adam_op.cu b/paddle/operators/adam_op.cu
index a3def912e540454275350209435eb01ae2151331..6e34f7818ce20c75692fe21776721ce200b7a147 100644
--- a/paddle/operators/adam_op.cu
+++ b/paddle/operators/adam_op.cu
@@ -17,4 +17,5 @@
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(adam,
-                       ops::AdamOpKernel<paddle::platform::GPUPlace, float>);
+                       ops::AdamOpKernel<paddle::platform::GPUPlace, float>,
+                       ops::AdamOpKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h
index 45938006db1231a7a134964d729df6ca114d4dbe..7f7fa1da1c0d8d81d1bcb18a1bf542838eddccf7 100644
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
@@ -31,9 +31,9 @@ class AdamOpKernel : public framework::OpKernel<T> {
     moment1_out_tensor->mutable_data<T>(ctx.GetPlace());
     moment2_out_tensor->mutable_data<T>(ctx.GetPlace());
 
-    float beta1 = ctx.Attr<float>("beta1");
-    float beta2 = ctx.Attr<float>("beta2");
-    float epsilon = ctx.Attr<float>("epsilon");
+    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
+    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
 
     auto param = framework::EigenVector<T>::Flatten(
         *ctx.Input<framework::Tensor>("Param"));
diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc
index 14cf3841b33a8153549e4c99ed2b75286e9c64db..d5bbc672e18f392d6a91383b919fefc4b2d8ff0e 100644
--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
@@ -126,4 +126,5 @@ division by 0 error.
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker);
 REGISTER_OP_CPU_KERNEL(adamax,
-                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, float>);
+                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, float>,
+                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/adamax_op.cu b/paddle/operators/adamax_op.cu
index fee3b6fc6b656917d79b84f48da8e63be7683890..057ef39025aa23704457ef7bbe54934d06cdc87f 100644
--- a/paddle/operators/adamax_op.cu
+++ b/paddle/operators/adamax_op.cu
@@ -17,4 +17,5 @@
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(adamax,
-                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, float>);
+                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, float>,
+                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h
index 2c99832ec08e9c1d9b5458c467d5238f9b1b3c37..bf36ed78604dd88c537db51fbeb38f43d0c46173 100644
--- a/paddle/operators/adamax_op.h
+++ b/paddle/operators/adamax_op.h
@@ -31,9 +31,9 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
     moment_out_tensor->mutable_data<T>(ctx.GetPlace());
     inf_norm_out_tensor->mutable_data<T>(ctx.GetPlace());
 
-    float beta1 = ctx.Attr<float>("beta1");
-    float beta2 = ctx.Attr<float>("beta2");
-    float epsilon = ctx.Attr<float>("epsilon");
+    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
+    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
 
     auto param = framework::EigenVector<T>::Flatten(
         *ctx.Input<framework::Tensor>("Param"));
diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h
index 233a81198e336d3190565fb18556f96979cec0ce..1f2b4fdb4b4a99d5baf5de1cc226dc196ab4eb2e 100644
--- a/paddle/operators/array_operator.h
+++ b/paddle/operators/array_operator.h
@@ -36,7 +36,7 @@ class ArrayOp : public framework::OperatorBase {
     if (platform::is_gpu_place(i_tensor.place())) {
       // FIXME: Avoid copy from GPU to CPU
       framework::Tensor t;
-      t.CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx);
+      framework::CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx, &t);
       dev_ctx.Wait();
       offset = static_cast<size_t>(*t.data<int64_t>());
     } else {
diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc
index c0903bb4e5ca7f160e19eefab99af7e3e4a8ed76..faeba7f3ed26d05de16775a1de4d42f802111207 100644
--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -102,8 +102,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
         if (len == 0) {
           continue;
         }
-        out->Slice(out_offset, out_offset + len)
-            .CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, dev_ctx);
+        auto slice = out->Slice(out_offset, out_offset + len);
+        framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place,
+                            dev_ctx, &slice);
         out_offset += len;
       }
     }
diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc
index 609e915b932e2bc4d5abee1e5f868cc07a7619d3..0a37f18729a93b15623c0a17e3689e518c38b844 100644
--- a/paddle/operators/assign_op.cc
+++ b/paddle/operators/assign_op.cc
@@ -43,7 +43,8 @@ class AssignFunctor {
     out_rows.set_rows(rows.rows());
     out_rows.set_height(rows.height());
     auto &t = rows.value();
-    out_rows.mutable_value()->CopyFrom(t, t.place(), dev_ctx_);
+    auto *m = out_rows.mutable_value();
+    framework::CopyFrom(t, t.place(), dev_ctx_, m);
   }
 
   template <typename T>
@@ -55,7 +56,7 @@ class AssignFunctor {
   void copy_tensor(const framework::LoDTensor &lod_tensor,
                    framework::LoDTensor *out) const {
     auto &out_tensor = *out;
-    out_tensor.CopyFrom(lod_tensor, lod_tensor.place(), dev_ctx_);
+    CopyFrom(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
     out_tensor.set_lod(lod_tensor.lod());
   }
 
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index f884e6efa917ce3f8554dce0e248f2b29273e3f3..ac97bd83ab7e7838871586cfe5acb832084b6cec 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -62,13 +62,14 @@ class BatchNormOp : public framework::OperatorWithKernel {
     const auto x_dims = ctx->GetInputDim("X");
     const TensorFormat tensor_format =
         StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
+
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "Input X must have 2 to 5 dimensions.");
+
     const int C =
         (tensor_format == TensorFormat::NCHW ? x_dims[1]
                                              : x_dims[x_dims.size() - 1]);
 
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "Input X must have 3 to 5 dimensions.");
-
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
@@ -146,8 +147,8 @@ class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
 
     const auto *x = ctx.Input<Tensor>("X");
     const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     const int N = x_dims[0];
     const int C =
         (tensor_format == TensorFormat::NCHW ? x_dims[1]
@@ -339,8 +340,8 @@ class BatchNormGradKernel<platform::CPUPlace, T>
     // Get the size for each dimension.
     // NCHW [batch_size, in_channels, in_height, in_width]
     const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     const int N = x_dims[0];
     const int C =
         (tensor_format == TensorFormat::NCHW ? x_dims[1]
diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc
index 726d1ea1b8d7ced93f94bb0e5bb4df9e43b0ac7b..7b2f3187007fa2491afa75de1cde1910c6ce9bb8 100644
--- a/paddle/operators/batch_norm_op.cu.cc
+++ b/paddle/operators/batch_norm_op.cu.cc
@@ -29,14 +29,21 @@ void ExtractNCWHD(const framework::DDim &dims,
                   const TensorFormat &tensor_format, int *N, int *C, int *H,
                   int *W, int *D) {
   *N = dims[0];
-  *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
-  *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
-  *W = dims.size() > 3
-           ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2])
-           : 1;
-  *D = dims.size() > 4
-           ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
-           : 1;
+  if (dims.size() == 2) {
+    *C = dims[1];
+    *H = 1;
+    *W = 1;
+    *D = 1;
+  } else {
+    *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
+    *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
+    *W = dims.size() > 3
+             ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2])
+             : 1;
+    *D = dims.size() > 4
+             ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
+             : 1;
+  }
 }
 
 template <typename T>
@@ -56,8 +63,8 @@ class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
     // NCHW [batch_size, in_channels, in_height, in_width]
     const auto *x = ctx.Input<Tensor>("X");
     const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     int N, C, H, W, D;
     ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
 
@@ -180,8 +187,8 @@ class BatchNormGradKernel<platform::GPUPlace, T>
 
     const auto &x_dims = x->dims();
 
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     int N, C, H, W, D;
     ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
 
diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc
index 3904a97d58166cfeeb2be7d2144700dbd8bc5721..c796a0c5d089499e7858c7a427825fdbeb05cb7f 100644
--- a/paddle/operators/beam_search_decode_op.cc
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -17,6 +17,36 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+struct BeamSearchDecodeFunctor {
+  BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
+                          const LoDTensorArray& step_scores,
+                          LoDTensor* id_tensor, LoDTensor* score_tensor)
+      : step_ids_(step_ids),
+        step_scores_(step_scores),
+        id_tensor_(id_tensor),
+        score_tensor_(score_tensor) {}
+
+  template <typename T>
+  void operator()() const;
+
+  const LoDTensorArray& step_ids_;
+  const LoDTensorArray& step_scores_;
+  LoDTensor* id_tensor_;
+  LoDTensor* score_tensor_;
+};
+
+template <typename T>
+void BeamSearchDecodeFunctor::operator()() const {
+  BeamSearchDecoder<T> beam_search_decoder;
+  beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_,
+                                   score_tensor_);
+}
+
+template <>
+void BeamSearchDecodeFunctor::operator()<bool>() const {
+  PADDLE_THROW("beam search decode op does not support bool!");
+}
+
 class BeamSearchDecodeOp : public framework::OperatorBase {
  public:
   BeamSearchDecodeOp(const std::string& type,
@@ -45,9 +75,9 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
     LoDTensor* sentenceIds = ctx.Output<LoDTensor>("SentenceIds");
     LoDTensor* sentenceScores = ctx.Output<LoDTensor>("SentenceScores");
 
-    BeamSearchDecoder<float> beam_search_decoder;
-    beam_search_decoder.PackAllSteps(*ids, *scores, sentenceIds,
-                                     sentenceScores);
+    framework::VisitDataType(
+        framework::ToDataType(scores->at(0).type()),
+        BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores));
   }
 };
 
diff --git a/paddle/operators/beam_search_decode_op.h b/paddle/operators/beam_search_decode_op.h
index 0f007ec22f9a66572971516a711317f348e1ec5a..3b1c6cd7a1045bfbb896725c79dc1ae2e22f43dc 100644
--- a/paddle/operators/beam_search_decode_op.h
+++ b/paddle/operators/beam_search_decode_op.h
@@ -232,12 +232,12 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
   id_tensor->set_lod(lod);
   id_tensor->Resize({static_cast<int64_t>(id_data.size())});
   id_tensor->mutable_data<int64_t>(paddle::platform::CPUPlace());
-  id_tensor->CopyFromVector<int64_t>(id_data, cpu_ctx);
+  framework::CopyFromVector<int64_t>(id_data, cpu_ctx, id_tensor);
 
   score_tensor->set_lod(lod);
   score_tensor->Resize({static_cast<int64_t>(score_data.size())});
   score_tensor->mutable_data<T>(paddle::platform::CPUPlace());
-  score_tensor->CopyFromVector<T>(score_data, cpu_ctx);
+  framework::CopyFromVector<T>(score_data, cpu_ctx, score_tensor);
 }
 
 template <typename T>
diff --git a/paddle/operators/beam_search_op.cc b/paddle/operators/beam_search_op.cc
index 17926a813d5b0b8ace6a1b20066cd0007703c696..8c3e2a303fb8f12a8886c11cf112b859a6db7bcf 100644
--- a/paddle/operators/beam_search_op.cc
+++ b/paddle/operators/beam_search_op.cc
@@ -139,7 +139,7 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
   items->reserve(framework::product(ids.dims()));
   for (size_t offset = abs_lod[lod_level_][sent_offset_];
        offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) {
-    for (int d = 0; d < instance_dim; d++) {
+    for (size_t d = 0; d < instance_dim; d++) {
       const size_t dim_offset = offset * instance_dim + d;
       items->emplace_back(offset, ids_data[dim_offset],
                           scores_data[dim_offset]);
diff --git a/paddle/operators/bilinear_tensor_product_op.cc b/paddle/operators/bilinear_tensor_product_op.cc
index c65ba7eb262f3aabe2c00837b79806c0b40b60fd..c88b2c9beb4497b617078c8ac5582d2f246f43fd 100644
--- a/paddle/operators/bilinear_tensor_product_op.cc
+++ b/paddle/operators/bilinear_tensor_product_op.cc
@@ -77,11 +77,19 @@ class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "The output of bilinear_tensor_product operator.");
     AddComment(R"DOC(
 Bilinear Tensor Product operator.
-Given input X and Y, a 3D tensor weight, and bias. Each column of the
-output is computed by one slice i = 1, . . . , k of the tensor:
-
-    M =  (X W_i) \cdot Y
-    Out_i = \sum_i {M_i} + Bias_i
+Given input X and Y, a 3D tensor Weight and a Bias. Each column of the
+Output is computed by one slice $i = 1, . . . , k$ of the tensor:
+
+$$
+M =  (X W_i) * Y \\
+Out_i = \sum_j {M_j} + Bias_i
+$$
+
+Where $W_i$ is the $i$-th slice of Input(Weight);
+      $M_j$ is the $j$-th column of $M$;
+      $Out_i$ is the $i$-th column of Output(Out);
+      $Bias_i$ is a column vector, each element of it is equal to
+        the $i$-th element of $Bias$;
 
 )DOC");
   }
diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc
index 70ee7861bab3a982eae60dd85b10c2e41f5827d0..3082a53ccfbe4f8666cfdfc2efed6b46ffdfede9 100644
--- a/paddle/operators/cast_op.cc
+++ b/paddle/operators/cast_op.cc
@@ -25,8 +25,8 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor of cast op");
     AddOutput("Out", "The output tensor of cast op");
-    AddAttr<int>("out_data_type", "output data type");
-    AddAttr<int>("in_data_type", "input data type");
+    AddAttr<int>("out_dtype", "output data type");
+    AddAttr<int>("in_dtype", "input data type");
     AddComment(R"DOC(
 Cast Operator.
 
@@ -58,8 +58,8 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker {
     grad->SetType("cast");
     grad->SetInput("X", OutputGrad("Out"));
     grad->SetOutput("Out", InputGrad("X"));
-    grad->SetAttr("out_data_type", GetAttr("in_data_type"));
-    grad->SetAttr("in_data_type", GetAttr("out_data_type"));
+    grad->SetAttr("out_dtype", GetAttr("in_dtype"));
+    grad->SetAttr("in_dtype", GetAttr("out_dtype"));
     return std::unique_ptr<framework::OpDescBind>(grad);
   }
 };
diff --git a/paddle/operators/cast_op.h b/paddle/operators/cast_op.h
index ffdbff7030afedab2efc06479ac86ad70c185f48..850dc8e3498351e54d41fcd2b6596c6fe668df14 100644
--- a/paddle/operators/cast_op.h
+++ b/paddle/operators/cast_op.h
@@ -55,7 +55,7 @@ class CastOpKernel : public framework::OpKernel<InT> {
     auto* in = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
     framework::VisitDataType(
-        static_cast<framework::DataType>(context.Attr<int>("out_data_type")),
+        static_cast<framework::DataType>(context.Attr<int>("out_dtype")),
         CastOpFunctor<Place, InT>(in, out, context.device_context()));
   }
 };
diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc
index 4c65b60d2349d2989128f4b1da705ea18391b8a3..0dd8c13b2ad6ff206066ccb98a4c009e4c3b4fd0 100644
--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
@@ -17,10 +17,10 @@
 namespace paddle {
 namespace operators {
 
-class CudnnConvOpMaker : public Conv2DOpMaker {
+class CudnnConv2DOpMaker : public Conv2DOpMaker {
  public:
-  CudnnConvOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  CudnnConv2DOpMaker(framework::OpProto* proto,
+                     framework::OpAttrChecker* op_checker)
       : Conv2DOpMaker(proto, op_checker) {
     AddAttr<int>("workspace_size_MB",
                  "workspace size for cudnn, in MB, "
@@ -32,15 +32,43 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
   }
 };
 
+class CudnnConv3DOpMaker : public Conv3DOpMaker {
+ public:
+  CudnnConv3DOpMaker(framework::OpProto* proto,
+                     framework::OpAttrChecker* op_checker)
+      : Conv3DOpMaker(proto, op_checker) {
+    AddAttr<int>("workspace_size_MB",
+                 "workspace size for cudnn, in MB, "
+                 "workspace is a section of GPU memory which will be "
+                 "allocated/freed each time the operator runs, larger "
+                 "workspace size can increase performance but also requires "
+                 "better hardware. This size should be chosen carefully.")
+        .SetDefault(4096);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv_cudnn, ops::ConvOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
-            ops::ConvOpGrad);
+REGISTER_OP(conv2d_cudnn, ops::ConvOp, ops::CudnnConv2DOpMaker,
+            conv2d_cudnn_grad, ops::ConvOpGrad);
+
+REGISTER_OP(conv3d_cudnn, ops::ConvOp, ops::CudnnConv3DOpMaker,
+            conv3d_cudnn_grad, ops::ConvOpGrad);
+
+REGISTER_OP_CPU_KERNEL(conv2d_cudnn,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv2d_cudnn_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
 
-REGISTER_OP_CPU_KERNEL(conv_cudnn,
-                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(conv3d_cudnn,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    conv_cudnn_grad,
-    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
+    conv3d_cudnn_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc
index 2aec4a2760260623c4c7054c590afa8e1c6c3fea..3f97dc7ee0a61944a8a57314b5ec7f33df619bf3 100644
--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
@@ -56,6 +56,21 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
     ScopedFilterDescriptor filter_desc;
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
+    if (input->dims().size() == 5) {
+      layout = DataLayout::kNCDHW;
+    }
+
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+#if CUDNN_VERSION_MIN(7, 0, 1)
+    // cudnn 7 can support groups, no need to do it mannually
+    // FIXME(typhoonzero): find a better way to disable groups
+    // rather than setting it to 1.
+    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+        cudnn_conv_desc, groups));
+    groups = 1;
+#endif
 
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()), groups);
@@ -63,19 +78,34 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
         layout, framework::vectorize2int(output->dims()), groups);
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()), groups);
-    cudnnConvolutionDescriptor_t cudnn_conv_desc =
-        conv_desc.descriptor<T>(paddings, strides, dilations);
 
     int input_channels = input->dims()[1];
-    int input_height = input->dims()[2];
-    int input_width = input->dims()[3];
-    int output_channels = output->dims()[1];
-    int output_height = output->dims()[2];
-    int output_width = output->dims()[3];
+    int input_height, input_width, input_depth;
+    if (input->dims().size() == 5) {
+      input_depth = input->dims()[2];
+      input_height = input->dims()[3];
+      input_width = input->dims()[4];
+    } else {  // dim size is enforced in InferShape
+      input_depth = 1;
+      input_height = input->dims()[2];
+      input_width = input->dims()[3];
+    }
+    int output_channels = filter->dims()[0];
+    int output_height, output_width, output_depth;
+    if (output->dims().size() == 5) {
+      output_depth = output->dims()[2];
+      output_height = output->dims()[3];
+      output_width = output->dims()[4];
+    } else {
+      output_depth = 1;
+      output_height = output->dims()[2];
+      output_width = output->dims()[3];
+    }
 
-    int group_offset_in = input_channels / groups * input_height * input_width;
+    int group_offset_in =
+        input_channels / groups * input_height * input_width * input_depth;
     int group_offset_out =
-        output_channels / groups * output_height * output_width;
+        output_channels / groups * output_height * output_width * output_depth;
     int group_offset_filter = filter->numel() / groups;
     // ------------------- cudnn conv workspace ---------------------
     void* cudnn_workspace = nullptr;
@@ -138,12 +168,26 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
     ScopedTensorDescriptor output_grad_desc;
-    ScopedTensorDescriptor input_grad_desc;
 
     ScopedFilterDescriptor filter_desc;
     ScopedFilterDescriptor filter_grad_desc;
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
+    if (input->dims().size() == 5) {
+      layout = DataLayout::kNCDHW;
+    }
+
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+#if CUDNN_VERSION_MIN(7, 0, 1)
+    // cudnn 7 can support groups, no need to do it mannually
+    // FIXME(typhoonzero): find a better way to disable groups
+    // rather than setting it to 1.
+    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+        cudnn_conv_desc, groups));
+    groups = 1;
+#endif
 
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()), groups);
@@ -152,22 +196,35 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
             layout, framework::vectorize2int(output_grad->dims()), groups);
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()), groups);
-    cudnnTensorDescriptor_t cudnn_input_grad_desc = nullptr;
-    cudnnFilterDescriptor_t cudnn_filter_grad_desc = nullptr;
-
-    cudnnConvolutionDescriptor_t cudnn_conv_desc =
-        conv_desc.descriptor<T>(paddings, strides, dilations);
 
     int input_channels = input->dims()[1];
-    int input_height = input->dims()[2];
-    int input_width = input->dims()[3];
+    int input_height, input_width, input_depth;
+    if (input->dims().size() == 5) {
+      input_depth = input->dims()[2];
+      input_height = input->dims()[3];
+      input_width = input->dims()[4];
+    } else {  // dim size is enforced in InferShape
+      input_depth = 1;
+      input_height = input->dims()[2];
+      input_width = input->dims()[3];
+    }
+
     int output_grad_channels = filter->dims()[0];
-    int output_grad_height = output_grad->dims()[2];
-    int output_grad_width = output_grad->dims()[3];
+    int output_grad_height, output_grad_width, output_grad_depth;
+    if (input->dims().size() == 5) {
+      output_grad_depth = output_grad->dims()[2];
+      output_grad_height = output_grad->dims()[3];
+      output_grad_width = output_grad->dims()[4];
+    } else {
+      output_grad_depth = 1;
+      output_grad_height = output_grad->dims()[2];
+      output_grad_width = output_grad->dims()[3];
+    }
 
-    int group_offset_in = input_channels / groups * input_height * input_width;
-    int group_offset_out =
-        output_grad_channels / groups * output_grad_height * output_grad_width;
+    int group_offset_in =
+        input_channels / groups * input_height * input_width * input_depth;
+    int group_offset_out = output_grad_channels / groups * output_grad_height *
+                           output_grad_width * output_grad_depth;
     int group_offset_filter = filter->numel() / groups;
     // ------------------- cudnn backward algorithm ---------------------
     cudnnConvolutionBwdDataAlgo_t data_algo;
@@ -180,8 +237,6 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 
     auto handle = ctx.cuda_device_context().cudnn_handle();
     if (input_grad) {
-      cudnn_input_grad_desc = input_grad_desc.descriptor<T>(
-          layout, framework::vectorize2int(input_grad->dims()), groups);
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
               handle, cudnn_filter_desc,
@@ -190,19 +245,17 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
               cudnn_output_grad_desc, cudnn_conv_desc,
               // dxDesc: Handle to the previously initialized output tensor
               // descriptor.
-              cudnn_input_grad_desc,
+              cudnn_input_desc,
               CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
               workspace_size_limit, &data_algo));
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
               handle, cudnn_filter_desc, cudnn_output_grad_desc,
-              cudnn_conv_desc, cudnn_input_grad_desc, data_algo, &tmp_size));
+              cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
     }
 
     if (filter_grad) {
-      cudnn_filter_grad_desc = filter_grad_desc.descriptor<T>(
-          layout, framework::vectorize2int(filter_grad->dims()), groups);
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
               handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
@@ -222,34 +275,30 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
-    // FIXME(typhoonzero): template type T may not be the same as cudnn call.
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*input_grad);
-      t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
-          t.constant(static_cast<T>(0));
+      // Because beta is zero, it is unnecessary to reset input_grad.
+
       for (int i = 0; i < groups; i++) {
         PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
             handle, &alpha, cudnn_filter_desc,
             filter_data + i * group_offset_filter, cudnn_output_grad_desc,
             output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta,
-            cudnn_input_grad_desc, input_grad_data + i * group_offset_in));
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+            input_grad_data + i * group_offset_in));
       }
     }
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*filter_grad);
-      t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
-          t.constant(static_cast<T>(0));
+      // Because beta is zero, it is unnecessary to reset filter_grad.
       for (int i = 0; i < groups; i++) {
         PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
             handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
             cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
             cudnn_conv_desc, filter_algo, cudnn_workspace,
-            workspace_size_in_bytes, &beta, cudnn_filter_grad_desc,
+            workspace_size_in_bytes, &beta, cudnn_filter_desc,
             filter_grad_data + i * group_offset_filter));
       }
     }
@@ -261,6 +310,16 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_GPU_KERNEL(conv_cudnn, paddle::operators::CudnnConvOpKernel<float>);
-REGISTER_OP_GPU_KERNEL(conv_cudnn_grad,
-                       paddle::operators::CudnnConvGradOpKernel<float>);
+REGISTER_OP_GPU_KERNEL(conv2d_cudnn,
+                       paddle::operators::CudnnConvOpKernel<float>,
+                       paddle::operators::CudnnConvOpKernel<double>);
+REGISTER_OP_GPU_KERNEL(conv2d_cudnn_grad,
+                       paddle::operators::CudnnConvGradOpKernel<float>,
+                       paddle::operators::CudnnConvGradOpKernel<double>);
+
+REGISTER_OP_GPU_KERNEL(conv3d_cudnn,
+                       paddle::operators::CudnnConvOpKernel<float>,
+                       paddle::operators::CudnnConvOpKernel<double>);
+REGISTER_OP_GPU_KERNEL(conv3d_cudnn_grad,
+                       paddle::operators::CudnnConvGradOpKernel<float>,
+                       paddle::operators::CudnnConvGradOpKernel<double>);
diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
index 687d741cb22a081eab18c61752200b9fd48f68a7..462e6d9cbcbe61d9911efe8beff4446620e1e932 100644
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -97,7 +97,7 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
       .SetDefault({0, 0});
   AddAttr<int>(
       "groups",
-      "(int default:1), the group size of convolution operator. "
+      "(int default:1), the groups number of the convolution operator. "
       "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
       "when group=2, the first half of the filters is only connected to the "
       "first half of the input channels, while the second half of the filters "
@@ -112,23 +112,29 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
 Convolution Operator.
 
 The convolution operation calculates the output based on the input, filter
-and strides, paddings, groups, dilations parameters. The size of each dimension of the
+and strides, paddings, dilations, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
-Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
+Input(Input) and Output(Output) are in NCHW format. Where N is batch
 size, C is the number of channels, H is the height of the feature, and W is
-the width of the feature. Parameters(ksize, strides, paddings, dilations) are two elements.
-These two elements represent height and width, respectively.
+the width of the feature.
+Filters(Input) is MCHW format. Where M is the number of output image channels, C is
+the number of input image channels, H is the height of the filter, and W
+is the width of the filter.
+Parameters(strides, paddings, dilations) are two elements. These two elements represent
+height and width, respectively.
 The input(X) size and output(Out) size may be different.
 
 Example:
   Input:
-       Input shape: (N, C_in, H_in, W_in)
-       Filter shape: (C_out, C_in, H_f, W_f)
+       Input shape: $(N, C_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
   Output:
-       Output shape: (N, C_out, H_out, W_out)
-  where
-       H_out = (H_in + 2 * paddings[0] - (dilations[0]*(filter_size[0] - 1) + 1)) / strides[0] + 1;
-       W_out = (W_in + 2 * paddings[1] - (dilations[1]*(filter_size[1] - 1) + 1)) / strides[1] + 1;
+       Output shape: $(N, C_{out}, H_{out}, W_{out})$
+  Where
+$$
+       H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
+       W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
+$$
 )DOC");
 }
 
@@ -165,7 +171,7 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
       .SetDefault({0, 0, 0});
   AddAttr<int>(
       "groups",
-      "(int default:1), the group size of convolution operator. "
+      "(int default:1), the groups number of the convolution operator. "
       "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
       "when group=2, the first half of the filters is only connected to the "
       "first half of the input channels, while the second half of the filters "
@@ -174,32 +180,37 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
   AddAttr<std::vector<int>>("dilations",
                             "(vector<int> default:{1, 1, 1}), the "
                             "dilations(d_dilation, h_dilation, w_dilation) of "
-                            "convolution operator. Currently, conv3d doesn't "
-                            "support dilation.")
+                            "convolution operator.")
       .SetDefault({1, 1, 1});
 
   AddComment(R"DOC(
 Convolution3D Operator.
 
 The convolution operation calculates the output based on the input, filter
-and strides, paddings, groups parameters. The size of each dimension of the
+and strides, paddings, dilations, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
-Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch
+Input(Input) and output(Output) are in NCDHW format, where N is batch
 size, C is the number of channels,D is the depth of the feature, H is the height of
-the feature, and W is the width of the feature. Parameters(ksize, strides, paddings)
-are three elements. These three elements represent depth, height and width, respectively.
+the feature, and W is the width of the feature.
+Filters(Input) is MCDHW format, where M is the number of output image channels,
+C is the number of input image channels, D is the depth of the filter,
+H is the height of the filter, and W is the width of the filter.
+Parameters(strides, paddings, dilations) are three elements. These three elements
+represent depth, height and width, respectively.
 The input(X) size and output(Out) size may be different.
 
 Example:
   Input:
-       Input shape: (N, C_in, D_in, H_in, W_in)
-       Filter shape: (C_out, C_in, D_f, H_f, W_f)
+       Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$
   Output:
-       Output shape: (N, C_out, D_out, H_out, W_out)
-  where
-       D_out = (D_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
-       H_out = (H_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
-       W_out = (W_in - filter_size[2] + 2 * paddings[2]) / strides[2] + 1;
+       Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
+  Where
+  $$
+       D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\
+       H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\
+       W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1
+  $$
 )DOC");
 }
 
@@ -225,11 +236,15 @@ REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
             ops::ConvOpGrad);
 
 REGISTER_OP_CPU_KERNEL(conv2d,
-                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
 
 REGISTER_OP_CPU_KERNEL(conv3d,
-                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc
index 8e6f9da455b7291049aee57189dae15b8bcc2150..546451234a1ed1a4d3119cb175c6d37ae3f0aac1 100644
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
@@ -17,11 +17,15 @@
 namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(conv2d,
-                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
-    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
 
 REGISTER_OP_GPU_KERNEL(conv3d,
-                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
-    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h
index fac5f1d0e25fe205f89fc7eeb9fadfd8431517d5..09bff0a68db82aa723dc08aa83c775910e17c5b8 100644
--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
@@ -38,7 +38,7 @@ inline bool IsExpand(std::vector<int64_t>& filter_dim,
                      std::vector<int>& dilations) {
   bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
   for (size_t j = 0; j < strides.size(); ++j) {
-    filter_1 = filter_1 && (static_cast<int>(filter_dim[j]) == 1);
+    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
     strides_1 = strides_1 && (strides[j] == 1);
     padding_0 = padding_0 && (paddings[j] == 0);
     dilation_1 = dilation_1 && (dilations[j] == 1);
@@ -91,32 +91,28 @@ class GemmConvKernel : public framework::OpKernel<T> {
 
     const int batch_size = static_cast<int>(input->dims()[0]);
 
-    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
     std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-    filter_shape_vec.erase(filter_shape_vec.begin(),
-                           filter_shape_vec.begin() + 2);
-
-    // output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
+    // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
     std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-    output_shape_vec.erase(output_shape_vec.begin(),
-                           output_shape_vec.begin() + 2);
 
     // use col_shape in the im2col calculation
     // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
     // o_h, o_w}
-    std::vector<int64_t> col_shape_vec;
-    col_shape_vec.push_back(input->dims()[1] / groups);
-    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
-                         filter_shape_vec.end());
-    col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
-                         output_shape_vec.end());
+    size_t data_dim = filter_shape_vec.size() - 2;
+    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+    col_shape_vec[0] = input->dims()[1] / groups;
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+    }
     framework::DDim col_shape(framework::make_ddim(col_shape_vec));
 
     // use col_matrix_shape in the gemm calculation
     // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d *
     // o_h * o_w)
     framework::DDim col_matrix_shape =
-        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
+        framework::flatten_to_2d(col_shape, data_dim + 1);
 
     bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
     Tensor col;
@@ -159,13 +155,13 @@ class GemmConvKernel : public framework::OpKernel<T> {
           col.ShareDataWith(in_slice);
           col_matrix.ShareDataWith(col);
           col_matrix.Resize(col_matrix_shape);
-        } else if (filter_shape_vec.size() == 2) {
+        } else if (data_dim == 2U) {
           // im2col
           im2col(context.device_context(), in_slice, dilations, strides,
                  std::vector<int>{paddings[0], paddings[1], paddings[0],
                                   paddings[1]},
                  &col);
-        } else if (filter_shape_vec.size() == 3) {
+        } else if (data_dim == 3U) {
           // vol2col
           vol2col(context.device_context(), in_slice, dilations, strides,
                   paddings, &col);
@@ -206,26 +202,22 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
 
     const int batch_size = static_cast<int>(input->dims()[0]);
 
-    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
     std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-    filter_shape_vec.erase(filter_shape_vec.begin(),
-                           filter_shape_vec.begin() + 2);
-
-    // output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
+    // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
     std::vector<int64_t> output_shape_vec(
         framework::vectorize(output_grad->dims()));
-    output_shape_vec.erase(output_shape_vec.begin(),
-                           output_shape_vec.begin() + 2);
 
     // use col_shape in the im2col calculation
     // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
     // o_h, o_w}
-    std::vector<int64_t> col_shape_vec;
-    col_shape_vec.push_back(input->dims()[1] / groups);
-    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
-                         filter_shape_vec.end());
-    col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
-                         output_shape_vec.end());
+    size_t data_dim = filter_shape_vec.size() - 2;
+    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+    col_shape_vec[0] = input->dims()[1] / groups;
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+    }
     framework::DDim col_shape(framework::make_ddim(col_shape_vec));
 
     // use col_matrix_shape in the gemm calculation
@@ -233,7 +225,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
     // or
     // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
     framework::DDim col_matrix_shape =
-        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
+        framework::flatten_to_2d(col_shape, data_dim + 1);
 
     framework::DDim input_shape = framework::slice_ddim(
         input->dims(), 1, static_cast<int>(input->dims().size()));
@@ -294,12 +286,12 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
                                  out_grad_slice, false, T(1.0), &col_matrix,
                                  T(0.0));
 
-          if (is_expand && filter_shape_vec.size() == 2) {
+          if (is_expand && data_dim == 2U) {
             col2im(context.device_context(), col, dilations, strides,
                    std::vector<int>{paddings[0], paddings[1], paddings[0],
                                     paddings[1]},
                    &in_grad_slice);
-          } else if (is_expand && filter_shape_vec.size() == 3) {
+          } else if (is_expand && data_dim == 3U) {
             col2vol(context.device_context(), col, dilations, strides, paddings,
                     &in_grad_slice);
           }
@@ -328,12 +320,12 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
             col.ShareDataWith(in_slice);
             col_matrix.ShareDataWith(col);
             col_matrix.Resize(col_matrix_shape);
-          } else if (filter_shape_vec.size() == 2) {
+          } else if (data_dim == 2U) {
             im2col(context.device_context(), in_slice, dilations, strides,
                    std::vector<int>{paddings[0], paddings[1], paddings[0],
                                     paddings[1]},
                    &col);
-          } else if (filter_shape_vec.size() == 3) {
+          } else if (data_dim == 3U) {
             vol2col(context.device_context(), in_slice, dilations, strides,
                     paddings, &col);
           }
diff --git a/paddle/operators/conv2d_transpose_cudnn_op.cc b/paddle/operators/conv_transpose_cudnn_op.cc
similarity index 55%
rename from paddle/operators/conv2d_transpose_cudnn_op.cc
rename to paddle/operators/conv_transpose_cudnn_op.cc
index fce1357ce5af5f11ccc5941690431393301e6725..0192178ce3a0a47196232f0723baec8324bea60b 100644
--- a/paddle/operators/conv2d_transpose_cudnn_op.cc
+++ b/paddle/operators/conv_transpose_cudnn_op.cc
@@ -23,7 +23,24 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
                               framework::OpAttrChecker* op_checker)
       : Conv2DTransposeOpMaker(proto, op_checker) {
     AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
-        .SetDefault(std::vector<int>{1, 1});
+        .SetDefault({1, 1});
+    AddAttr<int>("workspace_size_MB",
+                 "workspace size for cudnn, in MB, "
+                 "workspace is a section of GPU memory which will be "
+                 "allocated/freed each time the operator runs, larger "
+                 "workspace size can increase performance but also requires "
+                 "better hardward. This size should be carefully setted.")
+        .SetDefault(4096);
+  }
+};
+
+class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker {
+ public:
+  CudnnConv3DTransposeOpMaker(framework::OpProto* proto,
+                              framework::OpAttrChecker* op_checker)
+      : Conv3DTransposeOpMaker(proto, op_checker) {
+    AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
+        .SetDefault({1, 1, 1});
     AddAttr<int>("workspace_size_MB",
                  "workspace size for cudnn, in MB, "
                  "workspace is a section of GPU memory which will be "
@@ -44,7 +61,22 @@ REGISTER_OP(conv2d_transpose_cudnn, ops::ConvTransposeOp,
 
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose_cudnn,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose_cudnn_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
+
+REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp,
+            ops::CudnnConv3DTransposeOpMaker, conv3d_transpose_cudnn_grad,
+            ops::ConvTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose_cudnn,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose_cudnn_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/conv2d_transpose_cudnn_op.cu.cc b/paddle/operators/conv_transpose_cudnn_op.cu.cc
similarity index 89%
rename from paddle/operators/conv2d_transpose_cudnn_op.cu.cc
rename to paddle/operators/conv_transpose_cudnn_op.cu.cc
index eff058afc6cc5dacf2a054a33f352824865c1924..494904fe524ae30a5032e489a0c5f20179d8e8ce 100644
--- a/paddle/operators/conv2d_transpose_cudnn_op.cu.cc
+++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc
@@ -54,15 +54,21 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
     ScopedTensorDescriptor output_desc;
     ScopedFilterDescriptor filter_desc;
     ScopedConvolutionDescriptor conv_desc;
-    DataLayout layout = DataLayout::kNCHW;
+    DataLayout layout;
+
+    if (strides.size() == 2U) {
+      layout = DataLayout::kNCHW;
+    } else {
+      layout = DataLayout::kNCDHW;
+    }
 
-    // N, M, H, W
+    // (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()));
-    // N, C, O_h, O_w
+    // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
         layout, framework::vectorize2int(output->dims()));
-    // M, C, K_h, K_w
+    // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()));
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
@@ -136,13 +142,13 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
 
-    // Input: (N, M, H, W)
+    // Input: (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()));
-    // Output: (N, C, O_H, O_W)
+    // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
         layout, framework::vectorize2int(output_grad->dims()));
-    // Filter (M, C, K_H, K_W)
+    // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()));
 
@@ -200,8 +206,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      math::set_constant(ctx.device_context(), input_grad, 0);
-
+      // Because beta is zero, it is unnecessary to reset input_grad.
       PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
           handle, &alpha, cudnn_output_desc, output_grad_data,
           cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
@@ -212,8 +217,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-      math::set_constant(ctx.device_context(), filter_grad, 0);
-
+      // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
       PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
           handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
@@ -231,6 +235,15 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn,
-                       ops::CudnnConvTransposeOpKernel<float>);
+                       ops::CudnnConvTransposeOpKernel<float>,
+                       ops::CudnnConvTransposeOpKernel<double>);
 REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad,
-                       ops::CudnnConvTransposeGradOpKernel<float>);
+                       ops::CudnnConvTransposeGradOpKernel<float>,
+                       ops::CudnnConvTransposeGradOpKernel<double>);
+
+REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn,
+                       ops::CudnnConvTransposeOpKernel<float>,
+                       ops::CudnnConvTransposeOpKernel<double>);
+REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn_grad,
+                       ops::CudnnConvTransposeGradOpKernel<float>,
+                       ops::CudnnConvTransposeGradOpKernel<double>);
diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc
index 13ac0cd54cbeb8f68c2246f7e1d02f032266a72e..678b192dea78fc6b4a6b54c4bb09a55dfb8f9c38 100644
--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
@@ -30,11 +30,6 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
   std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
 
-  for (size_t i = 0; i < paddings.size(); ++i) {
-    PADDLE_ENFORCE_EQ(paddings[i], 0,
-                      "No Padding allowed in conv transpose op.");
-  }
-
   PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
                  "ConvTransposeOp intput should be 4-D or 5-D tensor.");
   PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
@@ -44,7 +39,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
                  "ConvTransposeOp input dimension and strides dimension should "
                  "be consistent.");
   PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
-                    "ConvTransposeOp paddings dimension and Conv strides "
+                    "ConvTransposeOp paddings dimension and strides "
                     "dimension should be the same.");
   PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
                     "In ConvTransposeOp, The input channel should be the same "
@@ -52,7 +47,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
 
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back((in_dims[i + 2] - 1) * strides[i] +
+    output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
                            filter_dims[i + 2]);
   }
   ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
@@ -67,24 +62,25 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
       "The format of input tensor is NCHW. Where N is batch size, C is the "
       "number of input channels, H is the height of the feature, and "
       "W is the width of the feature.");
-  AddInput("Filter",
-           "(Tensor) The filter tensor of convolution transpose operator. "
-           "The format of the filter tensor is CMHW, where C is the number of "
-           "output image channels, M is the number of input image channels, "
-           "H is the height of the filter, and W is the width of the filter. "
-           "We enforce groups number == 1 and padding == 0 in "
-           "the convolution transpose scenario.");
+  AddInput(
+      "Filter",
+      "(Tensor) The filter tensor of convolution transpose operator. "
+      "The format of the filter tensor is MCHW, where M is the number of "
+      "input feature channels, C is the number of "
+      "output feature channels,"
+      "H is the height of the filter, and W is the width of the filter. "
+      "We enforce groups number == 1 in the convolution transpose scenario.");
   AddOutput("Output",
             "(Tensor) The output tensor of convolution transpose operator. "
             "The format of output tensor is also NCHW.");
   AddAttr<std::vector<int>>(
       "strides",
-      "(vector<int> defalut:{1, 1}), the strides(h_stride, w_stride) of "
+      "(vector<int> default:{1, 1}), the strides(h_stride, w_stride) of "
       "convolution transpose operator.")
       .SetDefault({1, 1});
   AddAttr<std::vector<int>>(
       "paddings",
-      "(vector<int> defalut:{0, 0}), the paddings(h_pad, w_pad) of convolution "
+      "(vector<int> default:{0, 0}), the paddings(h_pad, w_pad) of convolution "
       "transpose operator.")
       .SetDefault({0, 0});
   AddComment(R"DOC(
@@ -93,21 +89,26 @@ Convolution2D Transpose Operator.
 The convolution transpose operation calculates the output based on the input, filter
 and strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
-
-Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
-size, C is the number of channels, H is the height of the feature, and 
-W is the width of the feature. Parameters(ksize, strides, paddings) are two elements.
-These two elements represent height and width, respectively.
+Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the
+number of channels, H is the height of the feature, and W is the width of the feature.
+Filter(Input) is in MCHW format. Where M is the number of input feature channels,
+C is the number of output feature channels, H is the height of the filter,
+and W is the width of the filter.
+Parameters(strides, paddings) are two elements. These two elements represent height
+and width, respectively.
 The input(X) size and output(Out) size may be different.
+
 Example:
   Input:
-       Input shape: (N, C_in, H_in, W_in)
-       Filter shape: (C_in, C_out, H_f, W_f)
+       Input shape: $(N, C_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{in}, C_{out}, H_f, W_f)$
   Output:
-       Output shape: (N, C_out, H_out, W_out)
-  where
-       H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
-       W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
+       Output shape: $(N, C_{out}, H_{out}, W_{out})$
+  Where
+  $$
+       H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + H_f \\
+       W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + W_f
+  $$
 )DOC");
 }
 
@@ -122,8 +123,9 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
            "W is the width of the feature.");
   AddInput("Filter",
            "(Tensor) The filter tensor of convolution transpose operator."
-           "The format of the filter tensor is CMDHW, where C is the number of "
-           "output image channels, M is the number of input image channels, D "
+           "The format of the filter tensor is MCDHW, where M is the number of "
+           "input feature channels, C is the number of "
+           "output feature channels, D "
            "is the depth of the filter, H is the height of the filter, and "
            "W is the width of the filter."
            "We enforce groups number == 1 and padding == 0 in "
@@ -135,12 +137,12 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
             "the number of channels, D is the depth of the feature, H is the "
             "height of the feature, and W is the width of the feature.");
   AddAttr<std::vector<int>>("strides",
-                            "(vector<int> defalut:{1, 1, 1}), the "
+                            "(vector<int> default:{1, 1, 1}), the "
                             "strides{d_stride, h_stride, w_stride} of "
                             "convolution transpose operator.")
       .SetDefault({1, 1, 1});
   AddAttr<std::vector<int>>("paddings",
-                            "(vector<int> defalut:{0, 0, 0}), paddings(d_pad, "
+                            "(vector<int> default:{0, 0, 0}), paddings(d_pad, "
                             "h_pad, w_pad) of convolution transpose operator.")
       .SetDefault({0, 0, 0});
   AddComment(R"DOC(
@@ -149,23 +151,28 @@ Convolution3D Transpose Operator.
 The convolution transpose operation calculates the output based on the input, filter
 and strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
-
-Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch
-size, C is the number of channels, D is the depth of the feature, 
-H is the height of the feature, and W is the width of the feature. 
-Parameters(ksize, strides, paddings) are three elements.
-These three elements represent depth, height and width, respectively.
+Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the
+number of channels, D is the depth of the feature, H is the height of the feature,
+and W is the width of the feature.
+Filter(Input) is in MCDHW format. Where M is the number of input feature channels,
+C is the number of output feature channels, D is the depth of the filter,H is the
+height of the filter, and W is the width of the filter.
+Parameters(strides, paddings) are three elements. These three elements represent
+depth, height and width, respectively.
 The input(X) size and output(Out) size may be different.
-Example:
+
+Example:   
   Input:
-       Input shape: (N, C_in, D_in, H_in, W_in)
-       Filter shape: (C_in, C_out, D_f, H_f, W_f)
+       Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$
   Output:
-       Output shape: (N, C_out, D_out, H_out, W_out)
-  where
-       D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
-       H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
-       W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2];
+       Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
+  Where
+  $$
+       D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + D_f \\
+       H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + H_f \\
+       W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + W_f
+  $$
 )DOC");
 }
 
@@ -190,17 +197,21 @@ REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
 
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
 
 REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
             conv3d_transpose_grad, ops::ConvTransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     conv3d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
     conv3d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/conv_transpose_op.cu.cc b/paddle/operators/conv_transpose_op.cu.cc
index 401cddb379ced134b800d2a078fe130a2850fbb2..4165eb0c7b048b83bbd94c57b971530043b66545 100644
--- a/paddle/operators/conv_transpose_op.cu.cc
+++ b/paddle/operators/conv_transpose_op.cu.cc
@@ -18,14 +18,18 @@ namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(
     conv2d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
     conv2d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, double>);
 
 REGISTER_OP_GPU_KERNEL(
     conv3d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
     conv3d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h
index 4b2bd60437da8f58054d8cdd5e6ba1fdac05f0d5..1cacb770e6af3ad3c99ab81c5598ffcd228f59b2 100644
--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/operators/conv_transpose_op.h
@@ -62,37 +62,31 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     Tensor* output = context.Output<Tensor>("Output");
 
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    // Actually, no paddings and groups allowed in conv transpose.
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    // TODO(Zhuoyuan): Paddings can be added in future.
     // groups will alway be disabled in conv2dtranspose.
 
     const int batch_size = static_cast<int>(input->dims()[0]);
 
-    // input_shape_vec: {h, w} or {d, h, w}
+    // input_shape_vec: {n, c, h, w} or {n, c, d, h, w}
     std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
-    input_shape_vec.erase(input_shape_vec.begin(), input_shape_vec.begin() + 2);
-
-    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w}
     std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
-    filter_shape_vec.erase(filter_shape_vec.begin(),
-                           filter_shape_vec.begin() + 2);
 
     // use col_shape in the im2col and col2im (or vol2col and col2vol)
     // calculation
     // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
-    std::vector<int64_t> col_shape_vec;
-    col_shape_vec.push_back(output->dims()[1]);
-    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
-                         filter_shape_vec.end());
-    col_shape_vec.insert(col_shape_vec.end(), input_shape_vec.begin(),
-                         input_shape_vec.end());
+    size_t data_dim = filter_shape_vec.size() - 2;
+    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+    col_shape_vec[0] = output->dims()[1];
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
+    }
     DDim col_shape(framework::make_ddim(col_shape_vec));
 
     // use col_matrix_shape in the gemm calculation
     // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
-    DDim col_matrix_shape =
-        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
+    DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);
 
     Tensor col;
     col.mutable_data<T>(col_shape, context.GetPlace());
@@ -137,7 +131,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
                              input_batch, false, static_cast<T>(1.0),
                              &col_matrix, static_cast<T>(0.0));
 
-      if (filter_shape_vec.size() == 2) {
+      if (data_dim == 2U) {
         // col2im: col_matrix -> dy
         // from (c * k_h * k_w, h * w) to (c, o_h, o_w)
         col2im(context.device_context(), col,
@@ -145,11 +139,11 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
                std::vector<int>{paddings[0], paddings[1], paddings[0],
                                 paddings[1]},
                &output_batch);
-      } else if (filter_shape_vec.size() == 3) {
+      } else if (data_dim == 3U) {
         // col2vol: col_matrix -> dy
         // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
-        col2vol(context.device_context(), col, dilations, strides,
-                std::vector<int>{0, 0, 0}, &output_batch);
+        col2vol(context.device_context(), col, dilations, strides, paddings,
+                &output_batch);
       }
     }
   }
@@ -173,35 +167,30 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
     if ((!input_grad) && (!filter_grad)) return;
 
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    // Actually, no paddings and groups allowed in conv transpose.
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
 
     const int batch_size = static_cast<int>(input->dims()[0]);
 
-    // input_shape_vec: {h, w} or {d, h, w}
+    // input_shape_vec: {n, c, h, w} or {n, c, d, h, w}
     std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
-    input_shape_vec.erase(input_shape_vec.begin(), input_shape_vec.begin() + 2);
-
-    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w}
     std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
-    filter_shape_vec.erase(filter_shape_vec.begin(),
-                           filter_shape_vec.begin() + 2);
 
     // use col_shape in the im2col and col2im (or vol2col and col2vol)
     // calculation
     // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
-    std::vector<int64_t> col_shape_vec;
-    col_shape_vec.push_back(output_grad->dims()[1]);
-    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
-                         filter_shape_vec.end());
-    col_shape_vec.insert(col_shape_vec.end(), input_shape_vec.begin(),
-                         input_shape_vec.end());
+    size_t data_dim = filter_shape_vec.size() - 2;
+    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+    col_shape_vec[0] = output_grad->dims()[1];
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
+    }
     DDim col_shape(framework::make_ddim(col_shape_vec));
 
     // use col_matrix_shape in the gemm calculation
     // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
-    DDim col_matrix_shape =
-        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
+    DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);
 
     // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
     DDim output_shape = framework::slice_ddim(output_grad->dims(), 1,
@@ -250,7 +239,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
         Tensor output_grad_batch =
             output_grad->Slice(i, i + 1).Resize(output_shape);
 
-        if (filter_shape_vec.size() == 2) {
+        if (data_dim == 2U) {
           // im2col: dy -> col matrix
           // from (c, o_h, o_w) to (c * k_h * k_w, h * w)
           im2col(context.device_context(), output_grad_batch,
@@ -258,7 +247,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
                  std::vector<int>{paddings[0], paddings[1], paddings[0],
                                   paddings[1]},
                  &col);
-        } else if (filter_shape_vec.size() == 3) {
+        } else if (data_dim == 3U) {
           // vol2col: dy -> col_matrix
           // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w)
           vol2col(context.device_context(), output_grad_batch, dilations,
diff --git a/paddle/operators/detail/CMakeLists.txt b/paddle/operators/detail/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f6bdc63cc2cfae526fe911ee4d989675452d5c5d
--- /dev/null
+++ b/paddle/operators/detail/CMakeLists.txt
@@ -0,0 +1 @@
+grpc_library(sendrecvop_grpc SRCS recv_impl.cc send_impl.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..89dc5045221156eed7aa9411bc96ad86f91136d2
--- /dev/null
+++ b/paddle/operators/detail/recv_impl.cc
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "send_recv_impl.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+Status SendRecvServerImpl::SendVariable(ServerContext *context,
+                                        const VariableMessage *in_var,
+                                        VariableMessage *out_var) {
+  framework::LoDTensor t;
+  // TODO(typhoonzero): desirealize in_tensor and run pserver network.
+  std::istringstream iss(in_var->serialized());
+  framework::DeserializeFromStream(iss, &t);
+  lodtensor_queue_.Push(std::move(t));
+  // Block util the sub graph is done.
+  t = lodtensor_return_queue_.Pop();
+  std::ostringstream oss;
+  // FIXME(typhoonzero): get context from op.
+  framework::SerializeToStream(oss, t, platform::CPUDeviceContext());
+  std::string *varname = out_var->mutable_varname();
+  *varname = in_var->varname();
+  std::string *serialized = out_var->mutable_serialized();
+  *serialized = oss.str();
+
+  return Status::OK;
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da1ddf75d2afb85670c5ea0c9884376415f28208
--- /dev/null
+++ b/paddle/operators/detail/send_impl.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "send_recv_impl.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+bool RPCClient::SendVariable(const framework::Scope& scope,
+                             const std::string& inname,
+                             const std::string& outname) {
+  ClientContext context;
+  VariableMessage msg, out_msg;
+  // FIXME(typhoonzero): pass device context to here.
+  auto ctx = platform::CPUDeviceContext();
+  auto* var = scope.FindVar(inname);
+  PADDLE_ENFORCE(var);
+  // TODO(typhoonzero): support SelectedRows
+  PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
+                 "Only support LoDTensor, %s has wrong type", inname);
+  const framework::LoDTensor& tensor = var->Get<framework::LoDTensor>();
+  std::ostringstream oss;
+  framework::SerializeToStream(oss, tensor, ctx);
+  msg.set_varname(inname);
+  msg.set_serialized(oss.str());
+  Status status = stub_->SendVariable(&context, msg, &out_msg);
+  if (!status.ok()) {
+    return false;
+  }
+  std::istringstream iss(out_msg.serialized());
+  framework::LoDTensor ret_tensor;
+  framework::DeserializeFromStream(iss, &ret_tensor);
+  auto* outvar = scope.FindVar(outname);
+  framework::LoDTensor* out_tensor = outvar->GetMutable<framework::LoDTensor>();
+  // FIXME(typhoonzero): do not copy.
+  framework::CopyFrom(ret_tensor, ctx.GetPlace(), ctx, out_tensor);
+  return true;
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto
new file mode 100644
index 0000000000000000000000000000000000000000..962c7d59819dede022474aec4a2d7f538d28c688
--- /dev/null
+++ b/paddle/operators/detail/send_recv.proto
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+syntax = "proto3";
+
+package sendrecv;
+
+service SendRecvService {
+  // For parameter server round-robin like hashing, do not split tensors.
+  // Send and recv only one tensor
+  rpc SendVariable(VariableMessage) returns (VariableMessage) {}
+}
+
+// VariableMessage is serialized paddle variable message.
+// It can be:
+// Tensor
+// LoDTensor
+// SelectedRows
+message VariableMessage {
+  string varname = 1;
+  bytes serialized = 2;
+}
+
+message VoidMessage {}
\ No newline at end of file
diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9a5340a8636db7b5d6ec7b21368632d3916b4aa
--- /dev/null
+++ b/paddle/operators/detail/send_recv_impl.h
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+// #include <grpc++/channel.h>
+// #include <grpc++/client_context.h>
+// #include <grpc++/create_channel.h>
+// #include <grpc++/security/credentials.h>
+#include "paddle/operators/detail/send_recv.grpc.pb.h"
+#include "paddle/operators/detail/send_recv.pb.h"
+
+#include <grpc++/grpc++.h>
+
+using grpc::Channel;
+using grpc::Server;
+using grpc::ServerContext;
+using grpc::ServerReader;
+using grpc::ServerBuilder;
+
+using grpc::ClientContext;
+using grpc::ClientReader;
+using grpc::ClientReaderWriter;
+using grpc::ClientWriter;
+using grpc::Status;
+using sendrecv::SendRecvService;
+using sendrecv::VariableMessage;
+using sendrecv::VoidMessage;
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+class SendRecvServerImpl final : public SendRecvService::Service {
+ public:
+  explicit SendRecvServerImpl() {}
+
+  Status SendVariable(ServerContext *context, const VariableMessage *in_var,
+                      VariableMessage *out_var) override;
+
+  const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); }
+
+  void Push(const framework::LoDTensor &tensor) {
+    this->lodtensor_return_queue_.Push(tensor);
+  }
+
+ private:
+  SimpleBlockQueue<framework::LoDTensor> lodtensor_queue_;
+  SimpleBlockQueue<framework::LoDTensor> lodtensor_return_queue_;
+  SimpleBlockQueue<framework::SelectedRows> selected_rows_queue_;
+  SimpleBlockQueue<framework::SelectedRows> selected_rows_return_queue_;
+};
+
+// RPCClient is a class to send tensors to pserver sub-network
+// using different hashing methods.
+class RPCClient {
+ public:
+  RPCClient(std::shared_ptr<Channel> channel)
+      : stub_(SendRecvService::NewStub(channel)) {}
+
+  bool SendVariable(const framework::Scope &scope, const std::string &inname,
+                    const std::string &outname);
+
+ private:
+  std::unique_ptr<SendRecvService::Stub> stub_;
+};
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/simple_block_queue.h b/paddle/operators/detail/simple_block_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..44899217579532af2c1d2e6074ec0e08231e7b86
--- /dev/null
+++ b/paddle/operators/detail/simple_block_queue.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+template <typename T>
+class SimpleBlockQueue {
+ private:
+  std::mutex mutex_;
+  std::condition_variable condition_;
+  std::deque<T> queue_;
+
+ public:
+  void Push(T const& value) {
+    {
+      std::unique_lock<std::mutex> lock(this->mutex_);
+      queue_.push_front(value);
+    }
+    this->condition_.notify_one();
+  }
+
+  T Pop() {
+    std::unique_lock<std::mutex> lock(this->mutex_);
+    this->condition_.wait(lock, [=] { return !this->queue_.empty(); });
+    T rc(std::move(this->queue_.back()));
+    this->queue_.pop_back();
+    return rc;
+  }
+};
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc
index 818146aca766cb13b93fd024c11c1209655d9e11..932c0bf8fbf6ffdc466516bb7c8578abf0f57209 100644
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -30,7 +30,7 @@ class DropoutOp : public framework::OperatorWithKernel {
 
     auto x_dims = ctx->GetInputDim("X");
     ctx->SetOutputDim("Out", x_dims);
-    if (ctx->Attrs().Get<bool>("is_training") == true) {
+    if (ctx->Attrs().Get<bool>("is_test") == false) {
       ctx->SetOutputDim("Mask", x_dims);
     }
     ctx->ShareLoD("X", /*->*/ "Out");
@@ -49,7 +49,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
 
     AddAttr<float>("dropout_prob", "Probability of setting units to zero.")
         .SetDefault(.5f);
-    AddAttr<bool>("is_training", "True if in training phase.").SetDefault(true);
+    AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
     AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
 
     AddComment(R"DOC(
@@ -71,8 +71,8 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_training"), true,
-                      "GradOp is only callable when is_training is true");
+    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_test"), false,
+                      "GradOp is only callable when is_test is false");
 
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Mask"), "Mask must not be null.");
diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu
index 30c769000f2b98c69eaa78a4c139630dd0956386..db3578b9bf4c081e431f202f0828ec6392c924b2 100644
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -59,7 +59,7 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
     auto Y = EigenMatrix<T>::Reshape(*y, 1);
 
     auto place = context.GetEigenDevice<Place>();
-    if (context.Attr<bool>("is_training")) {
+    if (!context.Attr<bool>("is_test")) {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
       int size = framework::product(mask->dims());
diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h
index 6000b75fecdff74844605215e9364ac8f8a1525a..d9a130fdc040f745b058c39221f0bb9661473388 100644
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -35,7 +35,7 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
     auto* y_data = y->mutable_data<T>(context.GetPlace());
     float dropout_prob = context.Attr<float>("dropout_prob");
 
-    if (context.Attr<bool>("is_training")) {
+    if (!context.Attr<bool>("is_test")) {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
       int seed = context.Attr<int>("seed");
@@ -65,8 +65,8 @@ template <typename Place, typename T>
 class DropoutGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE(context.Attr<bool>("is_training"),
-                   "GradOp is only callable when is_training is true");
+    PADDLE_ENFORCE(!context.Attr<bool>("is_test"),
+                   "GradOp is only callable when is_test is false");
 
     auto* grad_x = context.Output<Tensor>(framework::GradVarName("X"));
     auto* grad_y = context.Input<Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/operators/dynamic_recurrent_op.cc b/paddle/operators/dynamic_recurrent_op.cc
deleted file mode 100644
index d48cc4e8df587708ab93e7d788145adc01c1d3e5..0000000000000000000000000000000000000000
--- a/paddle/operators/dynamic_recurrent_op.cc
+++ /dev/null
@@ -1,418 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve .
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/operators/dynamic_recurrent_op.h"
-
-#include "paddle/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Scope;
-using framework::TensorArray;
-using framework::LoDTensor;
-using framework::Variable;
-using framework::OperatorBase;
-using framework::DySeqMetaBatch;
-
-namespace detail {
-
-inline void CreateVariables(Scope& scope,
-                            const std::vector<std::string>& var_names) {
-  for (const auto& name : var_names) {
-    scope.Var(name);
-  }
-}
-
-/*
- * The inputs with sequence should be reordered when they are split, so the
- * boot_states should be reordered in the same order.
- *
- * NOTE This may require that the `pre_state` of the first time step should just
- * copy the `boot_state` rather than reference it, for that the content should
- * be reordered, but the RNN op should not change the `boot_state` as an input
- * variable's content.
- */
-inline void ReorderInitialState(const DySeqMetaBatch& metas,
-                                const LoDTensor& boot_state, LoDTensor* tensor,
-                                const platform::Place& dst_place) {
-  for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) {
-    auto slice = tensor->Slice(seq_id, seq_id + 1);
-    auto boot_slice =
-        boot_state.Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1);
-    // TODO(superjom) pass in device context as an argument
-    slice.CopyFrom(boot_slice, dst_place, platform::CPUDeviceContext());
-  }
-}
-
-inline void RestoreInitialState(const DySeqMetaBatch& metas,
-                                const LoDTensor& tensor, LoDTensor* boot_state,
-                                const platform::Place& dst_place) {
-  for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) {
-    auto slice = tensor.Slice(seq_id, seq_id + 1);
-    auto boot_slice =
-        boot_state->Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1);
-    boot_slice.CopyFrom(slice, dst_place, platform::CPUDeviceContext());
-  }
-}
-
-}  // namespace detail
-
-// Implementation for forward propagation.
-template <>
-void RNNAlgorithm::Run<RNNAlgorithm::ComputeMode::kForward>(
-    const framework::Scope& scope, const framework::OperatorBase& op,
-    const platform::DeviceContext& dev_ctx) {
-  SetComputeMode(ComputeMode::kForward);
-  cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_);
-  SplitInputs();
-  CreateScopes();
-  WriteStepInputs();
-  InitStates();
-  WriteStepOutputs();
-  RunSteps();
-  ConcatOutputs();
-}
-
-// Implementation for backward propagation.
-template <>
-void RNNAlgorithm::Run<RNNAlgorithm::ComputeMode::kBackward>(
-    const framework::Scope& scope, const framework::OperatorBase& op,
-    const platform::DeviceContext& dev_ctx) {
-  SetComputeMode(ComputeMode::kBackward);
-  cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_);
-  SplitInputs();
-  WriteStepInputs();
-  InitStates();
-  WriteStepOutputs();
-  RunSteps();
-  // copy boot-states' gradients back.
-  for (const auto& state : arg_.states) {
-    ExportInitialStateGradient(state);
-  }
-
-  ConcatOutputs();
-}
-
-void RNNAlgorithm::SplitInputs() {
-  // TODO(superjom) make level a config
-  // TODO(superjom) check all the inputs has the same LoD
-  int level = 0;
-  for (const auto& item : cache_.inputs) {
-    const auto& var = item.second;
-    const auto& tensor = var->Get<LoDTensor>();
-    TensorArray& ta = step_inputs_[item.first];
-
-    dy_seq_metas_[item.first] =
-        ta.Unpack(tensor, level, true /*length_descend*/);
-
-    if (cache_.num_steps) {
-      PADDLE_ENFORCE_EQ(ta.size(), cache_.num_steps,
-                        "inputs should have the same steps");
-    } else {
-      cache_.num_steps = ta.size();
-    }
-  }
-}
-
-void RNNAlgorithm::WriteStepInputs() {
-  for (const auto& item : cache_.inputs) {
-    auto ta_it = step_inputs_.find(item.first);
-    PADDLE_ENFORCE(ta_it != step_inputs_.end(),
-                   "step_inputs_ not compatible with memory set");
-    TensorArray& ta = ta_it->second;
-    for (size_t step = 0; step < ta.size(); step++) {
-      auto tensor = ta.Read(step);
-      auto& step_scope = cache_.GetScope(step);
-      Variable* var = step_scope.FindVar(item.first);
-      if (var == nullptr) {
-        var = step_scope.Var(item.first);
-      }
-      var->GetMutable<LoDTensor>()->ShareDataWith(tensor);
-    }
-  }
-}
-
-void RNNAlgorithm::WriteStepOutputs() {
-  // initialize step outputs
-  for (const auto& item : cache_.outputs) {
-    step_outputs_.emplace(item.first, TensorArray());
-  }
-  PADDLE_ENFORCE_GT(step_outputs_.size(), 0UL);
-}
-
-void RNNAlgorithm::CreateScopes() {
-  PADDLE_ENFORCE_GT(cache_.num_steps, 0);
-  // resize scopes
-  size_t num_scopes_need_create = cache_.num_steps - cache_.scopes->size();
-  for (size_t i = 0; i < num_scopes_need_create; i++) {
-    cache_.scopes->emplace_back(&cache_.scope->NewScope());
-  }
-
-  // init temporary inputs
-  PADDLE_ENFORCE_NOT_NULL(step_unit_, "stepnet should be set first");
-  std::vector<std::string> states;
-  std::vector<std::string> ex_states;
-  std::vector<std::string> step_unit_outputs;
-  std::transform(arg_.states.begin(), arg_.states.end(),
-                 std::back_inserter(states),
-                 [](const rnn::StateAttr& m) { return m.var; });
-  std::transform(arg_.states.begin(), arg_.states.end(),
-                 std::back_inserter(ex_states),
-                 [](const rnn::StateAttr& m) { return m.pre_var; });
-  for (const auto& item : step_unit_->Outputs()) {
-    for (const auto& var : item.second) {
-      step_unit_outputs.push_back(var);
-    }
-  }
-
-  for (size_t step = 0; step < cache_.num_steps; step++) {
-    auto& scope = cache_.GetScope(step);
-    detail::CreateVariables(scope, arg_.inlinks);
-    detail::CreateVariables(scope, arg_.outlinks);
-    detail::CreateVariables(scope, states);
-    detail::CreateVariables(scope, ex_states);
-    detail::CreateVariables(scope, step_unit_outputs);
-  }
-}
-
-void RNNAlgorithm::ConcatOutputs() {
-  // TODO(superjom) transform this to a config
-  int level = 0;
-  for (size_t step = 0; step < cache_.num_steps; step++) {
-    auto& scope = cache_.GetScope(step);
-    for (auto& item : step_outputs_) {
-      auto* var = scope.FindVar(item.first);
-      PADDLE_ENFORCE_NOT_NULL(var);
-      auto* tensor = var->GetMutable<LoDTensor>();
-      tensor->mutable_data<value_type>(platform::CPUPlace());
-      item.second.WriteShared(step, *tensor);
-    }
-  }
-  // the inputs' lods should be the same, so randomly get one lod.
-  const auto& some_lod =
-      cache_.scope->FindVar(arg_.inlinks.front())->Get<LoDTensor>().lod();
-  const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
-  for (auto& item : step_outputs_) {
-    auto tensor = item.second.Pack(level, some_meta, some_lod);
-    auto* output = cache_.outputs[item.first]->GetMutable<LoDTensor>();
-    const_cast<LoDTensor*>(output)->ShareDataWith(tensor);
-  }
-}
-
-void RNNAlgorithm::RunSteps() {
-  if (IsBackward()) {
-    // call stepnet in all the time steps reversely
-    for (int step = cache_.num_steps - 1; step >= 0; step--) {
-      auto& step_scope = cache_.GetScope(step);
-      step_unit_->Run(step_scope, *cache_.dev_ctx);
-    }
-  } else {
-    for (size_t step = 0; step < cache_.num_steps; step++) {
-      auto& step_scope = cache_.GetScope(step);
-      step_unit_->Run(step_scope, *cache_.dev_ctx);
-    }
-  }
-}
-
-void RNNAlgorithm::InitStates() {
-  for (size_t step = 0; step < cache_.num_steps; step++) {
-    for (const auto& state : arg_.states) {
-      CreateState(state, step);
-      LinkState(state, step);
-    }
-  }
-}
-
-void RNNAlgorithm::CreateState(const rnn::StateAttr& state_attr, size_t step) {
-  auto& scope = cache_.GetScope(step);
-  auto& state = *cache_.GetTensor(scope, state_attr.var);
-  auto& boot_state = *cache_.GetTensor(*cache_.scope, state_attr.boot_var);
-
-  size_t num_instances =
-      step_inputs_[arg_.inlinks.front()].Read(step).dims()[0];
-  auto dims = boot_state.dims();
-  dims[0] = num_instances;
-
-  state.Resize(dims);
-  state.mutable_data<value_type>(platform::CPUPlace());
-  states_[state_attr.var].WriteShared(step, state);
-}
-
-void RNNAlgorithm::LinkState(const rnn::StateAttr& state, size_t step) {
-  auto& scope = cache_.GetScope(step);
-  auto& state_pre = *cache_.GetTensor(scope, state.pre_var);
-
-  // process the first state's boot-state(the 0-step in forward mode or the
-  // last step in backward mode)
-  // Only forward mode need to link the boot-state to the `pre-state` in first
-  // time step. In backward mode, need to copy the gradient of `pre-state` in
-  // first time step to the gradient of `boot-state`.
-  if (step == 0 && IsForward()) {
-    LinkInitialState(state);
-  } else {
-    size_t num_instances =
-        step_inputs_[arg_.inlinks.front()].Read(step).dims()[0];
-    auto* pre_state = cache_.GetTensor(cache_.GetScope(step - 1), state.var);
-    // shink and share from previous state
-    auto shrinked_pre_state = pre_state->Slice(0, num_instances);
-    state_pre.ShareDataWith(shrinked_pre_state);
-  }
-}
-
-void RNNAlgorithm::LinkInitialState(const rnn::StateAttr& state) {
-  // all the step_inputs' metas should be the same, just randomly select one
-  // and get the dyseq meta.
-  const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
-  auto& scope = cache_.GetScope(0);
-  auto& state_pre = *cache_.GetTensor(scope, state.pre_var);
-  auto* pre_state = cache_.GetTensor(*cache_.scope, state.boot_var);
-  pre_state->mutable_data<float>(platform::CPUPlace());
-  // allocate state
-  state_pre.Resize(pre_state->dims());
-  state_pre.mutable_data<value_type>(platform::CPUPlace());
-  detail::ReorderInitialState(some_meta, *pre_state, &state_pre,
-                              pre_state->place());
-}
-
-void RNNAlgorithm::ExportInitialStateGradient(const rnn::StateAttr& state) {
-  // all the step_inputs' metas should be the same, just randomly select one
-  // and get the dyseq meta.
-  const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
-  auto& scope = cache_.GetScope(0);
-
-  auto& state_pre = *cache_.GetTensor(scope, state.pre_var);
-  auto& pre_state = *cache_.GetTensor(*cache_.scope, state.boot_var);
-  pre_state.Resize(state_pre.dims());
-  detail::RestoreInitialState(some_meta, state_pre, &pre_state,
-                              pre_state.place());
-}
-
-void RNNAlgorithm::ArgCache::Init(const rnn::ArgumentName& name,
-                                  const paddle::framework::OperatorBase& op,
-                                  const paddle::framework::Scope& scope,
-                                  platform::DeviceContext const* dev_ctx,
-                                  rnn::Argument* arg) {
-  this->scope = &scope;
-  InitArgument(name, op, arg);
-  CacheScopes(scope, *arg);
-  CacheInlinks(scope, arg->inlinks);
-  CacheOutlinks(scope, arg->outlinks);
-  this->dev_ctx = dev_ctx;
-}
-
-void RNNAlgorithm::ArgCache::InitArgument(const rnn::ArgumentName& name,
-                                          const OperatorBase& op,
-                                          rnn::Argument* arg) {
-  rnn::InitArgument(name, arg, op, false /*is_grad*/);
-}
-
-void RNNAlgorithm::ArgCache::CacheScopes(const Scope& scope,
-                                         const rnn::Argument& arg) {
-  auto scopes_var = scope.FindVar(arg.step_scopes);
-  PADDLE_ENFORCE(scopes_var != nullptr,
-                 "the step_scopes output argument [%s] should be created first "
-                 "by framework.",
-                 arg.step_scopes);
-  this->scopes = scopes_var->GetMutable<std::vector<Scope*>>();
-}
-
-void RNNAlgorithm::ArgCache::CacheInlinks(
-    const Scope& scope, const std::vector<std::string>& names) {
-  for (auto name : names) {
-    auto* var = GetVariable(scope, name);
-    inputs[name] = var;
-  }
-}
-
-void RNNAlgorithm::ArgCache::CacheOutlinks(
-    const Scope& scope, const std::vector<std::string>& names) {
-  for (auto name : names) {
-    auto* var = GetVariable(scope, name);
-    outputs[name] = var;
-  }
-}
-
-Variable* RNNAlgorithm::ArgCache::GetVariable(const Scope& scope,
-                                              const std::string& name) {
-  auto* var = scope.FindVar(name);
-  PADDLE_ENFORCE_NOT_NULL(var, "variable [%s] not exist in scope", name);
-  return var;
-}
-
-LoDTensor* RNNAlgorithm::ArgCache::GetTensor(const framework::Scope& scope,
-                                             const std::string& name) {
-  auto* var = GetVariable(scope, name);
-  return var->GetMutable<LoDTensor>();
-}
-
-const std::array<rnn::ArgumentName, 2> RNNAlgorithm::kArgNames{
-    {rnn::ArgumentName{"step_unit", "step_scopes", "inputs", "outputs",
-                       "states", "ex_states", "initial_states"},
-     rnn::ArgumentName{"step_unit", "step_scopes@GRAD", "outputs@GRAD",
-                       "inputs@GRAD", "states", "ex_states",
-                       "initial_states@GRAD"}}};
-
-void DynamicRecurrentOp::Run(const framework::Scope& scope,
-                             const platform::DeviceContext& dev_ctx) const {
-  rnn.Run<RNNAlgorithm::ComputeMode::kForward>(
-      scope, *dynamic_cast<const OperatorBase*>(this), dev_ctx);
-}
-
-void DynamicRecurrentGradientOp::Run(
-    const Scope& scope, const platform::DeviceContext& dev_ctx) const {
-  rnn.Run<RNNAlgorithm::ComputeMode::kBackward>(
-      scope, *dynamic_cast<const OperatorBase*>(this), dev_ctx);
-}
-
-class DynamicRecurrentOpProtoAndCheckerMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  DynamicRecurrentOpProtoAndCheckerMaker(framework::OpProto* proto,
-                                         framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    const auto& name =
-        RNNAlgorithm::kArgNames[RNNAlgorithm::ComputeMode::kForward];
-    // inputs and outputs stored in proto
-    AddInput(name.inlinks,
-             "The inputs that need to be segmented for each step.")
-        .AsDuplicable();
-    AddInput(name.initial_states, "Variables to initialize the states.")
-        .AsDuplicable();
-
-    AddOutput(name.outlinks,
-              "The outputs that need to be concatenated for all steps.")
-        .AsDuplicable();
-    AddOutput(name.step_scopes, "step scopes");
-
-    // Attributes stored in AttributeMap
-    AddAttr<std::vector<std::string>>(name.ex_states, "names of ex_states");
-    AddAttr<std::vector<std::string>>(name.states, "names of states");
-
-    AddComment(R"DOC(
-Dynamic Recurrent Operator.
-
-This is a RNN operator for varience-length sequences.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP(dynamic_recurrent, paddle::operators::DynamicRecurrentOp,
-            paddle::operators::DynamicRecurrentOpProtoAndCheckerMaker,
-            dynamic_recurrent_grad,
-            paddle::operators::DynamicRecurrentGradientOp);
diff --git a/paddle/operators/dynamic_recurrent_op.h b/paddle/operators/dynamic_recurrent_op.h
deleted file mode 100644
index 5b0548c3a44c9f58838ecc567ee41a587883c26a..0000000000000000000000000000000000000000
--- a/paddle/operators/dynamic_recurrent_op.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_TESTING
-#include "gtest/gtest.h"
-#endif
-
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/tensor_array.h"
-#include "paddle/framework/variable.h"
-#include "paddle/operators/rnn/recurrent_op_utils.h"
-
-namespace paddle {
-namespace operators {
-
-class RNNAlgorithm {
- public:
-  enum ComputeMode { kForward = 0, kBackward = 1 };
-  static const std::array<rnn::ArgumentName, 2> kArgNames;
-  using value_type = float;
-
-  /*
-   * Different `Run` method for forward and backward, `_` is just for template
-   * specifialization.
-   */
-  template <ComputeMode _>
-  void Run(const framework::Scope& scope, const framework::OperatorBase& op,
-           const platform::DeviceContext& dev_ctx);
-  /*
-   * Split the inputs(LoDTensors) to segments for each time step.
-   */
-  void SplitInputs();
-
-  /*
-   * Create step-scopes to store temporary outputs in each time steps.
-   */
-  void CreateScopes();
-
-  /*
-   * Link TensorArray steps to the corresponding variables located in
-   * step-scopes.
-   */
-  void WriteStepInputs();
-
-  /*
-   * Write output of each step to the corresponding TensorArray.
-   */
-  void WriteStepOutputs();
-
-  /*
-   * Initialize the states, each state will have a corresponding pre-state,
-   * which share the memory with the state in the previous time state. The
-   * pre-state in the first time step will be initialized with an zero tensor or
-   * a tensor in parent scope if is provided.
-   */
-  void InitStates();
-
-  /*
-   * Create state variables for each time step.
-   */
-  void CreateState(const rnn::StateAttr& state, size_t step);
-
-  /*
-   * Link pre-state variable in current scope to the state variable in the
-   * previous time step (scope) by reference.
-   */
-  void LinkState(const rnn::StateAttr& state, size_t step);
-
-  /*
-   * Link the pre-state of the first time step to the `boot-state` in parent's
-   * scope.
-   */
-  void LinkInitialState(const rnn::StateAttr& state);
-
-  /*
-   * Copy the gradient from `pre-state` in the first step-scope to the
-   * `boot-state` in parent's scope.
-   */
-  void ExportInitialStateGradient(const rnn::StateAttr& state);
-
-  /*
-   * Calculate time steps.
-   */
-  void RunSteps();
-
-  /*
-   * Concatenate outputs in each time step and generate a LoDTensor.
-   */
-  void ConcatOutputs();
-
-  void SetComputeMode(ComputeMode mode) { mode_ = mode; }
-  bool IsForward() const { return mode_ == ComputeMode::kForward; }
-  bool IsBackward() const { return mode_ == ComputeMode::kBackward; }
-
-  /*
-   * set a step unit that is created according to a RecurrentOp's step unit.
-   */
-  void SetStepUnit(std::unique_ptr<framework::OperatorBase> step_unit) {
-    PADDLE_ENFORCE_NOT_NULL(step_unit);
-    step_unit_ = std::move(step_unit);
-  }
-  const framework::OperatorBase& GetStepUnit() const { return *step_unit_; }
-
-  const framework::TensorArray& state(const std::string& name) const {
-    auto it = states_.find(name);
-    PADDLE_ENFORCE(it != states_.end());
-    return it->second;
-  }
-  const framework::TensorArray& step_input(const std::string& name) const {
-    auto it = step_inputs_.find(name);
-    PADDLE_ENFORCE(it != step_inputs_.end());
-    return it->second;
-  }
-  const framework::TensorArray& step_output(const std::string& name) const {
-    auto it = step_outputs_.find(name);
-    PADDLE_ENFORCE(it != step_outputs_.end());
-    return it->second;
-  }
-
- protected:
-  struct ArgCache {
-    framework::Scope const* scope;
-    std::vector<framework::Scope*>* scopes;
-    std::map<std::string, framework::Variable*> inputs;
-    std::map<std::string, framework::Variable*> outputs;
-    platform::DeviceContext const* dev_ctx;
-
-    size_t num_steps{0};
-
-    void Init(const rnn::ArgumentName& name, const framework::OperatorBase& op,
-              const framework::Scope& scope,
-              platform::DeviceContext const* dev_ctx, rnn::Argument* arg);
-
-    framework::Scope& GetScope(size_t index) {
-      PADDLE_ENFORCE_LT(index, num_steps);
-      return *scopes->at(index);
-    }
-
-    framework::LoDTensor* GetTensor(const framework::Scope& scope,
-                                    const std::string& name);
-
-   private:
-    void InitArgument(const rnn::ArgumentName& name,
-                      const framework::OperatorBase& op, rnn::Argument* arg);
-    void CacheScopes(const framework::Scope& scope, const rnn::Argument& arg);
-    void CacheInlinks(const framework::Scope& scope,
-                      const std::vector<std::string>& names);
-    void CacheOutlinks(const framework::Scope& scope,
-                       const std::vector<std::string>& names);
-    framework::Variable* GetVariable(const framework::Scope& scope,
-                                     const std::string& name);
-  };
-
- private:
-  std::unique_ptr<framework::OperatorBase> step_unit_;
-  std::map<std::string, framework::TensorArray> states_;
-  std::map<std::string, framework::TensorArray> step_inputs_;
-  std::map<std::string, framework::TensorArray> step_outputs_;
-  std::map<std::string, std::vector<framework::DySeqMeta>> dy_seq_metas_;
-  rnn::Argument arg_;
-  ArgCache cache_;
-  ComputeMode mode_{ComputeMode::kForward};
-
-#ifdef PADDLE_WITH_TESTING
-  // test forward
-  friend class RNNAlgorithmTestHelper;
-  FRIEND_TEST(RNNAlgorithmTestHelper, SplitInputs);
-  FRIEND_TEST(RNNAlgorithmTestHelper, CreateCache);
-  FRIEND_TEST(RNNAlgorithmTestHelper, CreateScopes);
-  FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepInputs);
-  FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepOutputs);
-  FRIEND_TEST(RNNAlgorithmTestHelper, InitStates);
-  FRIEND_TEST(RNNAlgorithmTestHelper, ConcatOutputs);
-// TODO(superjom) test backward
-#endif
-};
-
-class DynamicRecurrentOp : public framework::OperatorBase {
- public:
-  DynamicRecurrentOp(const std::string& type,
-                     const framework::VariableNameMap& inputs,
-                     const framework::VariableNameMap& outputs,
-                     const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  DynamicRecurrentOp(const DynamicRecurrentOp& o)
-      : framework::OperatorBase(
-            static_cast<const framework::OperatorBase&>(o)) {
-    PADDLE_THROW("Not implemented");
-  }
-
-  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override;
-
-  mutable RNNAlgorithm rnn;
-};
-
-class DynamicRecurrentGradientOp : public framework::OperatorBase {
- public:
-  DynamicRecurrentGradientOp(const std::string& type,
-                             const framework::VariableNameMap& inputs,
-                             const framework::VariableNameMap& outputs,
-                             const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  DynamicRecurrentGradientOp(const DynamicRecurrentGradientOp& o)
-      : framework::OperatorBase(
-            static_cast<const framework::OperatorBase&>(o)) {
-    PADDLE_THROW("Not implemented");
-  }
-
-  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override;
-
-  mutable RNNAlgorithm rnn;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/dynamic_recurrent_op_test.cc b/paddle/operators/dynamic_recurrent_op_test.cc
deleted file mode 100644
index 8d840e259b190ead86a66df8ab31c5170db4d824..0000000000000000000000000000000000000000
--- a/paddle/operators/dynamic_recurrent_op_test.cc
+++ /dev/null
@@ -1,217 +0,0 @@
-#include "paddle/operators/dynamic_recurrent_op.h"
-
-#include <gtest/gtest.h>
-
-#include "paddle/framework/ddim.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/op_desc.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/net_op.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Scope;
-using framework::TensorArray;
-using framework::LoDTensor;
-using framework::Variable;
-
-class TestOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-  DEFINE_OP_CLONE_METHOD(TestOp);
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {}
-};
-
-void OpDescNewVar(const std::string& param_name,
-                  std::initializer_list<const char*> arguments,
-                  paddle::framework::OpDesc::Var* var) {
-  var->set_parameter(param_name);
-  for (auto& arg_name : arguments) {
-    var->add_arguments(arg_name);
-  }
-}
-
-// create a LoD tensor in scope with specific dims
-LoDTensor* CreateVar(Scope& scope, std::string name, framework::DDim dims,
-                     const platform::Place& place) {
-  auto* var = scope.Var(name);
-  auto* tensor = var->GetMutable<LoDTensor>();
-  tensor->Resize(dims);
-  tensor->mutable_data<float>(place);
-  return tensor;
-}
-
-class RNNAlgorithmTestHelper : public ::testing::Test {
- protected:
-  const rnn::ArgumentName argname = RNNAlgorithm::kArgNames[0];
-
-  virtual void SetUp() override {
-    CreateGlobalVariables();
-
-    auto op_desc = CreateOpDesc();
-    op = paddle::framework::OpRegistry::CreateOp(op_desc);
-    dop = &(dynamic_cast<DynamicRecurrentOp*>(op.get())->rnn);
-    InitCacheManually();
-    InitStepNet();
-  }
-
-  framework::OpDesc CreateOpDesc() {
-    // create op
-    paddle::framework::OpDesc op_desc;
-    op_desc.set_type("dynamic_recurrent");
-
-    OpDescNewVar(argname.inlinks, {"in0"}, op_desc.add_inputs());
-    OpDescNewVar(argname.initial_states, {"boot_mem"}, op_desc.add_inputs());
-    OpDescNewVar(argname.step_scopes, {"step_scopes"}, op_desc.add_outputs());
-    OpDescNewVar(argname.outlinks, {"out0"}, op_desc.add_outputs());
-
-    // set pre-states
-    auto pre_memories = op_desc.mutable_attrs()->Add();
-    pre_memories->set_name(argname.ex_states);
-    pre_memories->set_type(paddle::framework::AttrType::STRINGS);
-    auto pre_memories_item = pre_memories->add_strings();
-    *pre_memories_item = "mem@pre";
-
-    // set states
-    auto memories = op_desc.mutable_attrs()->Add();
-    memories->set_name(argname.states);
-    memories->set_type(paddle::framework::AttrType::STRINGS);
-    auto memories_item = memories->add_strings();
-    *memories_item = "mem";
-    return op_desc;
-  }
-
-  void CreateGlobalVariables() {
-    platform::CPUPlace place;
-    scope.Var("step_scopes");
-    CreateVar(scope, "boot_mem", framework::make_ddim({10, 20}), place);
-    CreateVar(scope, "out0", framework::make_ddim({10, 20}), place);
-    auto* in0 = CreateVar(scope, "in0", framework::make_ddim({10, 8}), place);
-    // 10 instanes with 4 sentences, length is 4, 3, 2, 1 respectively.
-    framework::LoD in0_lod(1);
-    for (int x : std::vector<int>{0, 4, 7, 9, 10}) {
-      in0_lod[0].push_back(x);
-    }
-    in0->set_lod(in0_lod);
-    in0->Resize(framework::make_ddim({10, 8}));
-    // set the content, each sentence content is seqid.batchid
-    // the seqid starts from 0
-    int start = 0;
-    for (size_t seqid = 0; seqid < in0_lod.size() - 1; seqid++) {
-      for (size_t batchid = 0;
-           batchid < in0_lod[0][seqid + 1] - in0_lod[0][seqid]; batchid++) {
-        float v = seqid + batchid * 0.1;
-
-        for (size_t dim = 0; dim < 8; dim++) {
-          in0->data<float>()[start * 8 + dim] = v;
-        }
-        start++;
-      }
-    }
-  }
-
-  void InitCacheManually() {
-    dop->cache_.Init(RNNAlgorithm::kArgNames[0], *op, scope, &device_context,
-                     &dop->arg_);
-  }
-
-  void InitStepNet() {
-    std::unique_ptr<framework::OperatorBase> stepnet{new NetOp};
-    dynamic_cast<NetOp*>(stepnet.get())
-        ->AppendOp(std::unique_ptr<TestOp>(new TestOp(
-            "test", {{"inputs", {"in0"}}, {"initial_states", {"boot_mem"}}},
-            {{"outputs", {"out0"}}, {"step_scopes", {"step_scopes"}}}, {})));
-    dop->SetStepUnit(std::move(stepnet));
-  }
-
- protected:
-  RNNAlgorithm* dop;
-  std::unique_ptr<framework::OperatorBase> op;
-  paddle::platform::CPUDeviceContext device_context;
-  paddle::framework::Scope scope;
-};
-
-TEST_F(RNNAlgorithmTestHelper, CreateCache) {
-  const rnn::Argument& arg = dop->arg_;
-  ASSERT_EQ(arg.inlinks.size(), 1UL);
-  ASSERT_EQ(arg.outlinks.size(), 1UL);
-}
-
-TEST_F(RNNAlgorithmTestHelper, SplitInputs) {
-  dop->SplitInputs();
-  auto& in0_ta = dop->step_inputs_["in0"];
-  ASSERT_EQ(in0_ta.size(), 4UL);
-
-  const auto& batch0 = in0_ta.Read(0);
-  const auto& batch1 = in0_ta.Read(1);
-  const auto& batch2 = in0_ta.Read(2);
-  const auto& batch3 = in0_ta.Read(3);
-  EXPECT_EQ(batch0.dims()[0], 4);
-  EXPECT_EQ(batch1.dims()[0], 3);
-  EXPECT_EQ(batch2.dims()[0], 2);
-  EXPECT_EQ(batch3.dims()[0], 1);
-}
-
-TEST_F(RNNAlgorithmTestHelper, CreateScopes) {
-  dop->SplitInputs();
-  dop->CreateScopes();
-  ASSERT_EQ(dop->cache_.num_steps, 4UL);
-  ASSERT_EQ(dop->cache_.scopes->size(), 4UL);
-}
-
-TEST_F(RNNAlgorithmTestHelper, WriteStepInputs) {
-  dop->SplitInputs();
-  dop->CreateScopes();
-  dop->WriteStepInputs();
-
-  for (size_t step = 0; step < dop->cache_.num_steps; step++) {
-    auto& scope = dop->cache_.GetScope(step);
-    for (auto name : std::vector<std::string>({"in0"})) {
-      ASSERT_TRUE(scope.FindVar(name) != nullptr);
-    }
-  }
-}
-
-TEST_F(RNNAlgorithmTestHelper, WriteStepOutputs) {
-  dop->SplitInputs();
-  dop->CreateScopes();
-  dop->WriteStepInputs();
-  dop->WriteStepOutputs();
-
-  for (size_t step = 0; step < dop->cache_.num_steps; step++) {
-    auto& scope = dop->cache_.GetScope(step);
-    for (auto name : std::vector<std::string>({"out0"})) {
-      ASSERT_TRUE(scope.FindVar(name));
-    }
-  }
-}
-
-TEST_F(RNNAlgorithmTestHelper, ConcatOutputs) {
-  // Let's leave this test to python unittest.
-}
-
-TEST_F(RNNAlgorithmTestHelper, InitStates) {
-  dop->SetComputeMode(RNNAlgorithm::ComputeMode::kForward);
-  dop->SplitInputs();
-  dop->CreateScopes();
-  dop->WriteStepInputs();
-  dop->WriteStepOutputs();
-  dop->InitStates();
-
-  for (size_t step = 0; step < dop->cache_.num_steps; step++) {
-    auto& scope = dop->cache_.GetScope(step);
-    auto state = scope.FindVar("mem");
-    ASSERT_TRUE(state != nullptr);
-
-    auto* pre_state = scope.FindVar("mem@pre");
-    ASSERT_TRUE(pre_state != nullptr);
-
-    auto* boot_state = scope.FindVar("boot_mem");
-    ASSERT_TRUE(boot_state != nullptr);
-  }
-}
-
-}  // operators
-}  // namespace paddle
diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h
index 8ae2c11a5d31dafc1b90d129054ebfabfb761bfe..4d7996ad1e744fead1329c35ce6ea43bf0683ce6 100644
--- a/paddle/operators/expand_op.h
+++ b/paddle/operators/expand_op.h
@@ -125,7 +125,8 @@ class ExpandGradKernel : public framework::OpKernel<T> {
       auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
       auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
       out0->mutable_data<T>(context.GetPlace());
-      out0->CopyFrom(*in0, context.GetPlace(), context.device_context());
+      framework::CopyFrom(*in0, context.GetPlace(), context.device_context(),
+                          out0);
     } else {
       switch (dims) {
         REP_EXPAND_GRAD_TEMPLATE(72)
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index 0dd84cbeaafbafd45132b0a0b744554ce7475411..ee43c22fb13e203c7de1a7e6d1586423fcbfb25a 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -47,7 +47,7 @@ class FeedOp : public framework::OperatorBase {
     auto &feed_list = feed_var->Get<framework::FeedFetchList>();
     auto &feed_item = feed_list.at(static_cast<size_t>(col));
     auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
-    out_item->CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx);
+    framework::CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx, out_item);
     out_item->set_lod(feed_item.lod());
   }
 };
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
index 8108ae69dec4bafd1c04d5ab05eef6f467d4c6e8..1ae07194c235ce6724f59c9c60df80f957787cda 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -51,7 +51,7 @@ class FetchOp : public framework::OperatorBase {
 
     // FIXME(yuyang18): Should we assume the fetch operator always generate
     // CPU outputs?
-    dst_item.CopyFrom(src_item, platform::CPUPlace(), dev_ctx);
+    CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
     dev_ctx.Wait();
     dst_item.set_lod(src_item.lod());
 
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc
index 985b5d1e865e513d833bff72dcd20a8f20851d8c..892922cd3aaec8bf8194320c5c3a0dd0365bb589 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
@@ -52,7 +52,7 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
-        static_cast<framework::DataType>(ctx.Attr<int>("data_type")),
+        static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
         ctx.device_context());
   }
 };
@@ -63,7 +63,7 @@ class FillConstantBatchSizeLikeOpMaker
   FillConstantBatchSizeLikeOpMaker(framework::OpProto *proto,
                                    framework::OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddAttr<int>("data_type",
+    AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
         .SetDefault(framework::DataType::FP32);
diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc
index 818f113b90a4c239a857791fb9957e51d3287b97..3d5f84bc239615797a5cf01a74150fdb7dfc1b80 100644
--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
@@ -34,7 +34,7 @@ class FillConstantOp : public framework::OperatorBase {
   using framework::OperatorBase::OperatorBase;
   void Run(const framework::Scope &scope,
            const platform::DeviceContext &dev_ctx) const override {
-    auto data_type = static_cast<framework::DataType>(Attr<int>("data_type"));
+    auto data_type = static_cast<framework::DataType>(Attr<int>("dtype"));
     auto value = Attr<float>("value");
     auto force_cpu = Attr<bool>("force_cpu");
     auto &out =
@@ -55,7 +55,7 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
   FillConstantOpMaker(framework::OpProto *proto,
                       framework::OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddAttr<int>("data_type",
+    AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
         .SetDefault(framework::DataType::FP32);
diff --git a/paddle/operators/ftrl_op.cc b/paddle/operators/ftrl_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb7ae6919623f10a6c4ec98c0e942c1590ac9a7a
--- /dev/null
+++ b/paddle/operators/ftrl_op.cc
@@ -0,0 +1,139 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/ftrl_op.h"
+
+namespace paddle {
+namespace operators {
+
+class FTRLOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("SquaredAccumulator"),
+                   "Input(SquaredAccumulator) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LinearAccumulator"),
+                   "Input(LinearAccumulator) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of FTRL should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("SquaredAccumOut"),
+                   "Output(SquaredAccumOut) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("LinearAccumOut"),
+                   "Output(LinearAccumOut) of FTRL should not be null.");
+
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"),
+                      "Two input of FTRL Op's dimension must be same.");
+
+    auto lr_dim = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
+                      "Learning Rate should be a scalar.");
+
+    ctx->SetOutputDim("ParamOut", param_dim);
+    ctx->SetOutputDim("SquaredAccumOut", param_dim);
+    ctx->SetOutputDim("LinearAccumOut", param_dim);
+  }
+};
+
+class FTRLOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FTRLOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param",
+             "(Tensor, default Tensor<float>) "
+             "Input parameter value that has to be updated.");
+    AddInput("SquaredAccumulator",
+             "(Tensor, default Tensor<float>) "
+             "Accumulator that accumulates squared gradients.");
+    AddInput("LinearAccumulator",
+             "(Tensor, default Tensor<float>) "
+             "Accumulator that accumulates linear gradients.");
+    AddInput("Grad",
+             "(Tensor, default Tensor<float>) "
+             "Input gradient of the parameter.");
+    AddInput("LearningRate",
+             "(Tensor, default Tensor<float>) "
+             "The learning rate should be a tensor of size 1.");
+
+    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
+    AddOutput("SquaredAccumOut",
+              "(Tensor) Output accumulated squared"
+              " gradients.");
+    AddOutput("LinearAccumOut",
+              "(Tensor) Output accumulated linear"
+              " gradients.");
+
+    AddAttr<float>("l1",
+                   "(float, default 0.0) "
+                   "L1 regularization strength.")
+        .SetDefault(0.0f);
+    AddAttr<float>("l2",
+                   "(float, default 0.0) "
+                   "L2 regularization strength.")
+        .SetDefault(0.0f);
+    AddAttr<float>("lr_power",
+                   "(float, default -0.5f) "
+                   "Learning Rate Power.")
+        .SetDefault(-0.5f);
+    AddComment(R"DOC(
+FTRL (Follow The Regularized Leader) Operator.
+
+Optimizer that implements the FTRL algorithm:
+
+$$
+new\_accum = squared\_accum + grad^2 \\
+if (lr\_power == -0.5) {
+   linear\_accum += grad - (\surd(new\_accum) - \surd(squared\_accum)) /
+                   (learning\_rate * param) \\
+} else {
+   linear\_accum += grad -
+                  (new\_accum^{-lr\_power} - accum^{-lr\_power}) /
+                  (learning\_rate * param) \\
+}
+
+x = (l1 * sign(linear\_accum) - linear\_accum)
+if (lr\_power == -0.5) {
+   y = \frac{\surd(new\_accum)}{learning\_rate} + (2 * l2) \\
+   pre\_shrink = \frac{x}{y} \\
+   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\
+} else {
+   y = \frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2) \\
+   pre\_shrink = \frac{x}{y} \\
+   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\
+}
+squared\_accum += grad^2;
+$$
+
+The paper that proposed Follow The Regularized Leader (FTRL):
+(https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, ops::FTRLOpMaker);
+REGISTER_OP_CPU_KERNEL(ftrl,
+                       ops::FTRLOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/ftrl_op.cu b/paddle/operators/ftrl_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..97b36dade6f531df49615ae2d44d565eadba7154
--- /dev/null
+++ b/paddle/operators/ftrl_op.cu
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+You may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/ftrl_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(ftrl,
+                       ops::FTRLOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/ftrl_op.h b/paddle/operators/ftrl_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..b040162f8d1d8998aa13021c10a25fe57135c1e9
--- /dev/null
+++ b/paddle/operators/ftrl_op.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class FTRLOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* param_out = ctx.Output<Tensor>("ParamOut");
+    auto* sq_accum_out = ctx.Output<Tensor>("SquaredAccumOut");
+    auto* lin_accum_out = ctx.Output<Tensor>("LinearAccumOut");
+
+    param_out->mutable_data<T>(ctx.GetPlace());
+    sq_accum_out->mutable_data<T>(ctx.GetPlace());
+    lin_accum_out->mutable_data<T>(ctx.GetPlace());
+
+    auto grad = ctx.Input<Tensor>("Grad");
+
+    auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
+    auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
+    auto lr_power = static_cast<T>(ctx.Attr<float>("lr_power"));
+
+    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+    auto sq_accum =
+        EigenVector<T>::Flatten(*ctx.Input<Tensor>("SquaredAccumulator"));
+    auto lin_accum =
+        EigenVector<T>::Flatten(*ctx.Input<Tensor>("LinearAccumulator"));
+    auto g = EigenVector<T>::Flatten(*grad);
+    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+
+    auto p_out = EigenVector<T>::Flatten(*param_out);
+    auto s_acc_out = EigenVector<T>::Flatten(*sq_accum_out);
+    auto l_acc_out = EigenVector<T>::Flatten(*lin_accum_out);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+
+    auto new_accum = sq_accum + g * g;
+    // Special case for lr_power = -0.5
+    if (lr_power == static_cast<T>(-0.5)) {
+      l_acc_out.device(place) =
+          lin_accum + g -
+          ((new_accum.sqrt() - sq_accum.sqrt()) / lr.broadcast(grad_dsize)) * p;
+    } else {
+      l_acc_out.device(place) =
+          lin_accum + g -
+          ((new_accum.pow(-lr_power) - sq_accum.pow(-lr_power)) /
+           lr.broadcast(grad_dsize)) *
+              p;
+    }
+
+    auto x = (l_acc_out.constant(l1) * l_acc_out.sign() - l_acc_out);
+    if (lr_power == static_cast<T>(-0.5)) {
+      auto y = (new_accum.sqrt() / lr.broadcast(grad_dsize)) +
+               l_acc_out.constant(static_cast<T>(2) * l2);
+      auto pre_shrink = x / y;
+      p_out.device(place) =
+          (l_acc_out.abs() > l_acc_out.constant(l1))
+              .select(pre_shrink, p.constant(static_cast<T>(0)));
+    } else {
+      auto y = (new_accum.pow(-lr_power) / lr.broadcast(grad_dsize)) +
+               l_acc_out.constant(static_cast<T>(2) * l2);
+      auto pre_shrink = x / y;
+      p_out.device(place) =
+          (l_acc_out.abs() > l_acc_out.constant(l1))
+              .select(pre_shrink, p.constant(static_cast<T>(0)));
+    }
+
+    s_acc_out.device(place) = sq_accum + g * g;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc
index 53ad86c6c48d1868f4495af51661d91b39a84f0b..254c83e1378a121d99c89d9d8705935b5f06edc8 100644
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -60,7 +60,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        static_cast<framework::DataType>(ctx.Attr<int>("data_type")),
+        static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
         ctx.device_context());
   }
 };
@@ -88,7 +88,7 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
                  "Random seed of generator."
                  "0 means use system wide seed.")
         .SetDefault(0);
-    AddAttr<int>("data_type",
+    AddAttr<int>("dtype",
                  "(int, default 5(FP32)) "
                  "Output data type.")
         .SetDefault(framework::DataType::FP32);
diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h
index 55e9cc4a98bd6d36ce5d6bb4116039d0ec18b485..564489d3a98b59e3e527be5613a73d23d6dbbf31 100644
--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
@@ -24,8 +24,17 @@
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+template <typename Place, typename T>
+inline void ReorderInitState(const platform::DeviceContext& ctx,
+                             const framework::Tensor& src, const size_t* index,
+                             framework::Tensor* dst, bool indexed_src) {
+  math::CopyMatrixRowsFunctor<Place, T> row_shuffle;
+  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
+  row_shuffle(ctx, src, index, *dst, indexed_src);
+}
 
 template <typename Place, typename T>
 class GRUKernel : public framework::OpKernel<T> {
@@ -33,7 +42,6 @@ class GRUKernel : public framework::OpKernel<T> {
   void BatchCompute(const framework::ExecutionContext& context) const {
     auto* input = context.Input<LoDTensor>("Input");
     auto* h0 = context.Input<Tensor>("H0");
-    const T* h0_data = h0 ? h0->data<T>() : nullptr;
     auto* weight = context.Input<Tensor>("Weight");
     const T* weight_data = weight->data<T>();
     auto* bias = context.Input<Tensor>("Bias");
@@ -63,10 +71,21 @@ class GRUKernel : public framework::OpKernel<T> {
 
     int frame_size = hidden_dims[1];
     math::hl_gru_value<T> gru_value;
-    gru_value.gateWeight = const_cast<T*>(weight_data);
-    gru_value.stateWeight =
+    gru_value.gate_weight = const_cast<T*>(weight_data);
+    gru_value.state_weight =
         const_cast<T*>(weight_data + 2 * frame_size * frame_size);
-    gru_value.prevOutValue = const_cast<T*>(h0_data);
+    Tensor ordered_h0;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (h0) {
+      // Since the batch computing for GRU reorders the input sequences
+      // according to their length. The initialized cell state also needs
+      // to reorder.
+      ReorderInitState<Place, T>(context.device_context(), *h0, order,
+                                 &ordered_h0, true);
+      gru_value.prev_out_value = ordered_h0.data<T>();
+    } else {
+      gru_value.prev_out_value = nullptr;
+    }
     auto batch_starts = batch_gate->lod()[0];
     size_t num_batch = batch_starts.size() - 1;
     for (size_t n = 0; n < num_batch; n++) {
@@ -77,14 +96,14 @@ class GRUKernel : public framework::OpKernel<T> {
       Tensor gate_t = batch_gate->Slice(bstart, bend);
       Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
       Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-      gru_value.outputValue = hidden_t.data<T>();
-      gru_value.gateValue = gate_t.data<T>();
-      gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
+      gru_value.output_value = hidden_t.data<T>();
+      gru_value.gate_value = gate_t.data<T>();
+      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
       math::GRUUnitFunctor<Place, T>::compute(
           dev_ctx, gru_value, frame_size, cur_batch_size,
           math::ActiveType(context.Attr<std::string>("activation")),
           math::ActiveType(context.Attr<std::string>("gate_activation")));
-      gru_value.prevOutValue = gru_value.outputValue;
+      gru_value.prev_out_value = gru_value.output_value;
     }
 
     math::Batch2LoDTensorFunctor<Place, T> to_seq;
@@ -102,7 +121,6 @@ class GRUGradKernel : public framework::OpKernel<T> {
  public:
   void BatchCompute(const framework::ExecutionContext& context) const {
     auto* h0 = context.Input<Tensor>("H0");
-    const T* h0_data = h0 ? h0->data<T>() : nullptr;
     auto* weight = context.Input<Tensor>("Weight");
     const T* weight_data = weight->data<T>();
     auto* batch_gate = context.Input<LoDTensor>("BatchGate");
@@ -135,25 +153,36 @@ class GRUGradKernel : public framework::OpKernel<T> {
     zero(dev_ctx, &batch_gate_grad, static_cast<T>(0.0));
     zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
 
+    Tensor ordered_h0, ordered_h0_grad;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (h0) {
+      ReorderInitState<Place, T>(context.device_context(), *h0, order,
+                                 &ordered_h0, true);
+    }
+    if (h0_grad) {
+      ordered_h0_grad.mutable_data<T>(h0_grad->dims(), context.GetPlace());
+      zero(context.device_context(), &ordered_h0_grad, static_cast<T>(0.0));
+    }
+
     bool is_reverse = context.Attr<bool>("is_reverse");
     batch_hidden_grad.set_lod(batch_hidden->lod());
     to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
 
     math::hl_gru_value<T> gru_value;
-    gru_value.gateWeight = const_cast<T*>(weight_data);
-    gru_value.stateWeight =
+    gru_value.gate_weight = const_cast<T*>(weight_data);
+    gru_value.state_weight =
         const_cast<T*>(weight_data + 2 * frame_size * frame_size);
 
     math::hl_gru_grad<T> gru_grad;
     if (weight_grad) {
-      gru_grad.gateWeightGrad =
+      gru_grad.gate_weight_grad =
           weight_grad->mutable_data<T>(context.GetPlace());
       zero(dev_ctx, weight_grad, static_cast<T>(0.0));
-      gru_grad.stateWeightGrad =
+      gru_grad.state_weight_grad =
           weight_grad->data<T>() + 2 * frame_size * frame_size;
     } else {
-      gru_grad.gateWeightGrad = nullptr;
-      gru_grad.stateWeightGrad = nullptr;
+      gru_grad.gate_weight_grad = nullptr;
+      gru_grad.state_weight_grad = nullptr;
     }
 
     auto batch_starts = batch_hidden_grad.lod()[0];
@@ -164,32 +193,27 @@ class GRUGradKernel : public framework::OpKernel<T> {
       int cur_batch_size = bend - bstart;
 
       Tensor gate_t = batch_gate->Slice(bstart, bend);
-      gru_value.gateValue = gate_t.data<T>();
+      gru_value.gate_value = gate_t.data<T>();
       Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
-      gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
+      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
 
       Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend);
-      gru_grad.outputGrad = hidden_grad_t.data<T>();
+      gru_grad.output_grad = hidden_grad_t.data<T>();
       Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend);
-      gru_grad.gateGrad = gate_grad_t.data<T>();
+      gru_grad.gate_grad = gate_grad_t.data<T>();
       Tensor reset_hidden_prev_grad_t =
           batch_reset_hidden_prev_grad.Slice(bstart, bend);
-      gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>();
+      gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data<T>();
       if (n == 0) {
-        gru_value.prevOutValue = const_cast<T*>(h0_data);
-        if (h0_grad) {
-          T* h0_grad_data = h0_grad->mutable_data<T>(context.GetPlace());
-          zero(dev_ctx, h0_grad, static_cast<T>(0.0));
-          gru_grad.prevOutGrad = h0_grad_data;
-        } else {
-          gru_grad.prevOutGrad = nullptr;
-        }
+        gru_value.prev_out_value = h0 ? ordered_h0.data<T>() : nullptr;
+        gru_grad.prev_out_grad =
+            h0 && h0_grad ? ordered_h0_grad.data<T>() : nullptr;
       } else {
         int bstart_pre = static_cast<int>(batch_starts[n - 1]);
         Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
-        gru_value.prevOutValue = hidden_prev_t.data<T>();
+        gru_value.prev_out_value = hidden_prev_t.data<T>();
         Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart);
-        gru_grad.prevOutGrad = hidden_prev_grad_t.data<T>();
+        gru_grad.prev_out_grad = hidden_prev_grad_t.data<T>();
       }
 
       math::GRUUnitGradFunctor<Place, T>::compute(
@@ -208,6 +232,10 @@ class GRUGradKernel : public framework::OpKernel<T> {
       math::ColwiseSum<Place, T> col_sum;
       col_sum(dev_ctx, batch_gate_grad, bias_grad);
     }
+    if (h0 && h0_grad) {
+      ReorderInitState<Place, T>(context.device_context(), ordered_h0_grad,
+                                 order, h0_grad, false);
+    }
   }
 
   void Compute(const framework::ExecutionContext& context) const override {
diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc
index 89c027ff1eea93012dc5ab22b081786efc328e96..877c969103cfc17e1b170449d1922d9c7db2a58b 100644
--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
@@ -114,18 +114,19 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(sigmoid)
         .InEnum({identity, sigmoid, tanh, relu});
     AddComment(R"DOC(
-GRUUnit Operator.
-
-This operator implements partial calculations of the GRU unit as follows:
+GRUUnit Operator implements partial calculations of the GRU unit as following:
 
 $$
-update \ gate: u_t = actGate(xu_t + W_u * hidden_{prev} + bias_u) \\
-reset \ gate: r_t = actGate(xr_t + W_r * hidden_{prev} + bias_r)  \\
-output \ candidate: {h}_t = actNode({xc}_t + W_c * dot(r_t, hidden_{prev}) + bias_c) \\
-output: h_t = dot((1-u_t), {h}_t) + dot(u_t, hidden_{prev})
+update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
+reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
+output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
+output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
 $$
 
-The rest of GRU unit can be completed by using FCOp's output as the input of GRUUnitOp.
+which is same as one time step of GRU Operator.
+
+@note To implement the complete GRU unit, fully-connected operator must be 
+used before to feed xu, xr and xc as the Input of GRUUnit operator.
 
 )DOC");
   }
@@ -150,12 +151,6 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
                    "ResetHiddenPrev");
     PADDLE_ENFORCE(ctx->HasInput("Hidden"),
                    "Input(%s) of GRUUnitGradOp should not be null.", "Hidden");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Gate")),
-                   "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
-                   "Gate");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("ResetHiddenPrev")),
-                   "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
-                   "ResetHiddenPrev");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
                    "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
                    "Hidden");
diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h
index c53e7d9827e0395e6ce613302e732b2797f83cdd..3398c0934e250cfc292776d08773204bb9b4d87e 100644
--- a/paddle/operators/gru_unit_op.h
+++ b/paddle/operators/gru_unit_op.h
@@ -28,6 +28,10 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
 enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 };
 
 template <typename Place, typename T>
@@ -110,7 +114,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     auto c = g.slice(c_offsets, extents);  // output candidate
 
     // calculate final output
-    h.device(place) = u * (h_p - c) + c;
+    h.device(place) = u * (c - h_p) + h_p;
   }
 };
 
@@ -146,35 +150,27 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     auto* weight_grad =
         context.Output<Tensor>(framework::GradVarName("Weight"));
     auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
-    input_grad->mutable_data<T>(context.GetPlace());
-    hidden_prev_grad->mutable_data<T>(context.GetPlace());
-    weight_grad->mutable_data<T>(context.GetPlace());
     Tensor gate_grad;
-    gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
     Tensor reset_hidden_prev_grad;
-    reset_hidden_prev_grad.mutable_data<T>(reset_hidden_prev->dims(),
-                                           context.GetPlace());
-
-    int batch_size = input->dims()[0];
-    int frame_size = hidden_prev->dims()[1];
 
     const T* hidden_prev_data = hidden_prev->data<T>();
-    T* hidden_prev_grad_data = hidden_prev_grad->data<T>();
     const T* weight_data = weight->data<T>();
-    T* weight_grad_data = weight_grad->data<T>();
-    T* gate_grad_data = gate_grad.data<T>();
+    T* gate_grad_data =
+        gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
     const T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
-    T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.data<T>();
+    T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data<T>(
+        reset_hidden_prev->dims(), context.GetPlace());
 
     auto h_p = EigenMatrix<T>::From(*hidden_prev);
     auto g = EigenMatrix<T>::From(*gate);
     auto d_h = EigenMatrix<T>::From(*hidden_grad);
-    auto d_x = EigenMatrix<T>::From(*input_grad);
-    auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
     auto d_g = EigenMatrix<T>::From(gate_grad);
     auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
     auto place = context.GetEigenDevice<Place>();
 
+    int batch_size = input->dims()[0];
+    int frame_size = hidden_prev->dims()[1];
+
     Eigen::array<int, 2> extents({{batch_size, frame_size}});
     Eigen::array<int, 2> u_offsets({{0, 0}});
     auto u = g.slice(u_offsets, extents);  // update gate
@@ -185,42 +181,56 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
 
     // backward for unactivated update gate
     ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
-                   d_g.slice(u_offsets, extents), d_h * (h_p - c));
+                   d_g.slice(u_offsets, extents), d_h * (c - h_p));
     // backward for unactivated output candidate
     ActGradCompute(context.Attr<int>("activation"), place, c, c,
-                   d_g.slice(c_offsets, extents), d_h * (u.constant(T(1)) - u));
+                   d_g.slice(c_offsets, extents), d_h * u);
     // backward for reset_hidden_prev
     math::gemm<Place, T>(context.device_context(), false, true, batch_size,
                          frame_size, frame_size, 1,
                          gate_grad_data + frame_size * 2, frame_size * 3,
                          weight_data + frame_size * frame_size * 2, frame_size,
                          0, reset_hidden_prev_grad_data, frame_size);
-    // backward for state_weight
-    math::gemm<Place, T>(
-        context.device_context(), true, false, frame_size, frame_size,
-        batch_size, 1, reset_hidden_prev_data, frame_size,
-        gate_grad_data + frame_size * 2, frame_size * 3, 0,
-        weight_grad_data + frame_size * frame_size * 2, frame_size);
     // backward for unactivated reset gate
     ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
                    d_g.slice(r_offsets, extents), d_r_h_p * h_p);
-    // backward for update_gate_weight and reset_gate_weight
-    math::gemm<Place, T>(context.device_context(), true, false, frame_size,
-                         frame_size * 2, batch_size, 1, hidden_prev_data,
-                         frame_size, gate_grad_data, frame_size * 3, 0,
-                         weight_grad_data, frame_size * 2);
+    // backward for weight
+    if (weight_grad) {
+      T* weight_grad_data = weight_grad->mutable_data<T>(context.GetPlace());
+      // backward for state_weight
+      math::gemm<Place, T>(
+          context.device_context(), true, false, frame_size, frame_size,
+          batch_size, 1, reset_hidden_prev_data, frame_size,
+          gate_grad_data + frame_size * 2, frame_size * 3, 0,
+          weight_grad_data + frame_size * frame_size * 2, frame_size);
+
+      // backward for update_gate_weight and reset_gate_weight
+      math::gemm<Place, T>(context.device_context(), true, false, frame_size,
+                           frame_size * 2, batch_size, 1, hidden_prev_data,
+                           frame_size, gate_grad_data, frame_size * 3, 0,
+                           weight_grad_data, frame_size * 2);
+    }
     // backward for hidden_prev
-    d_h_p.device(place) = d_r_h_p * r + d_h * u;
-    math::gemm<Place, T>(context.device_context(), false, true, batch_size,
-                         frame_size, frame_size * 2, 1, gate_grad_data,
-                         frame_size * 3, weight_data, frame_size * 2, 1,
-                         hidden_prev_grad_data, frame_size);
+    if (hidden_prev_grad) {
+      T* hidden_prev_grad_data =
+          hidden_prev_grad->mutable_data<T>(context.GetPlace());
+      auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
+      d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
+      math::gemm<Place, T>(context.device_context(), false, true, batch_size,
+                           frame_size, frame_size * 2, 1, gate_grad_data,
+                           frame_size * 3, weight_data, frame_size * 2, 1,
+                           hidden_prev_grad_data, frame_size);
+    }
     // backward for input
-    d_x.device(place) = d_g;
+    if (input_grad) {
+      input_grad->mutable_data<T>(context.GetPlace());
+      auto d_x = EigenMatrix<T>::From(*input_grad);
+      d_x.device(place) = d_g;
+    }
     // backward for bias
     if (bias_grad) {
       bias_grad->mutable_data<T>(context.GetPlace());
-      auto d_b = EigenMatrix<T>::From(*bias_grad);
+      auto d_b = EigenVector<T>::Flatten(*bias_grad);
       d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
     }
   }
diff --git a/paddle/operators/huber_loss_op.cc b/paddle/operators/huber_loss_op.cc
index 3435e74b0afb470fcbd1c0f4e06ad363352cac00..938803d5b36177c782fe40bc34fd92504e5bbf7b 100644
--- a/paddle/operators/huber_loss_op.cc
+++ b/paddle/operators/huber_loss_op.cc
@@ -70,11 +70,18 @@ input value and Y as the target value. Huber loss can evaluate the fitness of
 X to Y. Different from MSE loss, Huber loss is more robust for outliers. The
 shape of X and Y are [batch_size, 1]. The equation is:
 
-L_{\delta}(y, f(x)) =
+$$
+Out_{\delta}(X, Y)_i =
 \begin{cases}
-0.5 * (y - f(x))^2, \quad |y - f(x)| \leq \delta \\
-\delta * (|y - f(x)| - 0.5 * \delta),   \quad otherwise
+0.5 * (Y_i - X_i)^2,
+\quad |Y_i - X_i| \leq \delta \\
+\delta * (|Y_i - X_i| - 0.5 * \delta),
+\quad otherwise
 \end{cases}
+$$
+
+In the above equation, $Out_\delta(X, Y)_i$, $X_i$ and $Y_i$ represent the ith
+element of Out, X and Y.
 
 )DOC");
   }
diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc
index 066bdf67aa037e9c25cfdfaff7ec8771eb59cde8..8e079a14e0a15e8ff803b6087e6b0b02083479ef 100644
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -32,19 +32,19 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
              "[(D + 2) x D]. The learnable parameter for the linear_chain_crf "
              "operator. See more details in the operator's comments.");
     AddInput("Label",
-             "(LoDTensor, default LoDTensor<int>) A LoDTensor with shape "
+             "(LoDTensor, default LoDTensor<int64_t>) A LoDTensor with shape "
              "[N x 1], where N is the total element number in a mini-batch. "
              "The ground truth.");
     AddOutput(
         "Alpha",
         "(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
-        "The forward vectors for the entire batch. Denote it as \f$\alpha\f$. "
-        "\f$\alpha$\f is a memo table used to calculate the normalization "
-        "factor in CRF. \f$\alpha[k, v]$\f stores the unnormalized "
+        "The forward vectors for the entire batch. Denote it as $\alpha$. "
+        "$\alpha$ is a memo table used to calculate the normalization "
+        "factor in CRF. $\alpha[k, v]$ stores the unnormalized "
         "probabilites of all possible unfinished sequences of tags that end at "
-        "position \f$k$\f with tag \f$v$\f. For each \f$k$\f, "
-        "\f$\alpha[k, v]$\f is a vector of length \f$D$\f with a component for "
-        "each tag value \f$v$\f. This vector is called a forward vecotr and "
+        "position $k$ with tag $v$. For each $k$, "
+        "$\alpha[k, v]$ is a vector of length $D$ with a component for "
+        "each tag value $v$. This vector is called a forward vecotr and "
         "will also be used in backward computations.")
         .AsIntermediate();
     AddOutput(
@@ -73,9 +73,9 @@ LinearChainCRF Operator.
 
 Conditional Random Field defines an undirected probabilistic graph with nodes
 denoting random variables and edges denoting dependencies between these
-variables. CRF learns the conditional probability \f$P(Y|X)\f$, where
-\f$X = (x_1, x_2, ... , x_n)\f$ are structured inputs and
-\f$Y = (y_1, y_2, ... , y_n)\f$ are labels for the inputs.
+variables. CRF learns the conditional probability $P(Y|X)$, where
+$X = (x_1, x_2, ... , x_n)$ are structured inputs and
+$Y = (y_1, y_2, ... , y_n)$ are labels for the inputs.
 
 Linear chain CRF is a special case of CRF that is useful for sequence labeling
 task. Sequence labeling tasks do not assume a lot of conditional
@@ -88,21 +88,22 @@ CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
 http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.
 
 Equation:
-1. Denote Input(Emission) to this operator as \f$x\f$ here.
+1. Denote Input(Emission) to this operator as $x$ here.
 2. The first D values of Input(Transition) to this operator are for starting
-weights, denoted as \f$a\f$ here.
+weights, denoted as $a$ here.
 3. The next D values of Input(Transition) of this operator are for ending
-weights, denoted as \f$b\f$ here.
+weights, denoted as $b$ here.
 4. The remaning values of Input(Transition) are for transition weights,
-denoted as \f$w\f$ here.
-5. Denote Input(Label) as \f$s\f$ here.
-
-The probability of a sequence \f$s\f$ of length \f$L\f$ is defined as:
-\f$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L}
-                 + \sum_{l=1}^L x_{s_l}
-                 + \sum_{l=2}^L w_{s_{l-1},s_l})\f$
-where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over
-all possible sequences is \f$1\f$, and \f$x\f$ is the emission feature weight
+denoted as $w$ here.
+5. Denote Input(Label) as $s$ here.
+
+The probability of a sequence $s$ of length $L$ is defined as:
+$$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L}
+                + \sum_{l=1}^L x_{s_l}
+                + \sum_{l=2}^L w_{s_{l-1},s_l})$$
+
+where $Z$ is a normalization value so that the sum of $P(s)$ over
+all possible sequences is 1, and $x$ is the emission feature weight
 to the linear chain CRF.
 
 Finally, the linear chain CRF operator outputs the logarithm of the conditional
diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h
index ddf73981751798c72cef08f2dd5c87580b45aec3..014bbfa7580011e38a2f546e30d1e584965a7815 100644
--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
@@ -195,7 +195,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     auto copyLoDTensor = [](const platform::DeviceContext& ctx,
                             const LoDTensor& src, LoDTensor* dst) {
       dst->mutable_data<T>(src.dims(), platform::CPUPlace());
-      dst->CopyFrom(src, platform::CPUPlace(), ctx);
+      framework::CopyFrom(src, platform::CPUPlace(), ctx, dst);
     };
 
     copyLoDTensor(ctx, emission_weights_src, emission_weights_dst);
@@ -203,8 +203,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
 
     transition_weights_dst->mutable_data<T>(transition_weights_src.dims(),
                                             platform::CPUPlace());
-    transition_weights_dst->CopyFrom(transition_weights_src,
-                                     platform::CPUPlace(), ctx);
+    framework::CopyFrom(transition_weights_src, platform::CPUPlace(), ctx,
+                        transition_weights_dst);
   }
 
   void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx,
@@ -219,7 +219,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
                          Tensor* dst) {
       dst->mutable_data<T>(platform::GPUPlace());
-      dst->CopyFrom(src, platform::GPUPlace(), ctx);
+      framework::CopyFrom(src, platform::GPUPlace(), ctx, dst);
     };
     copyTensor(ctx, emission_exps_src, emission_exps_dst);
     copyTensor(ctx, transition_exps_src, transition_exps_dst);
@@ -271,7 +271,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     ll -= std::log(sum);
     // Now ll is equal to -log(Z).
 
-    const int* lbl = label.data<int>();
+    const int64_t* lbl = label.data<int64_t>();
     PADDLE_ENFORCE_LT(
         static_cast<size_t>(*std::max_element(lbl, lbl + seq_length)), tag_num,
         "An invalid tag label that execesses the largest tag number.");
@@ -410,12 +410,12 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
     // Copy the inputs from GPU memory to CPU memory when this operators runs on
     // GPU device.
     label_dst->mutable_data<T>(label_src.dims(), platform::CPUPlace());
-    label_dst->CopyFrom(label_src, platform::CPUPlace(), ctx);
+    framework::CopyFrom(label_src, platform::CPUPlace(), ctx, label_dst);
 
     auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
                          Tensor* dst) {
       dst->mutable_data<T>(src.dims(), platform::CPUPlace());
-      dst->CopyFrom(src, platform::CPUPlace(), ctx);
+      framework::CopyFrom(src, platform::CPUPlace(), ctx, dst);
     };
     copyTensor(ctx, emission_exps_src, emission_exps_dst);
     copyTensor(ctx, transition_exps_src, transition_exps_dst);
@@ -434,7 +434,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
                          Tensor* dst) {
       if (src && dst) {
         dst->mutable_data<T>(platform::GPUPlace());
-        dst->CopyFrom(*src, platform::GPUPlace(), ctx);
+        framework::CopyFrom(*src, platform::GPUPlace(), ctx, dst);
       }
     };
     copyTensor(ctx, emission_grad_src, emission_grad_dst);
@@ -449,7 +449,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
                            Tensor* emission_grad) const {
     const T* w_exps = transition_exps.data<T>();
     const T* x_exps = emission_exps.data<T>();
-    const int* label_value = label.data<int>();
+    const int64_t* label_value = label.data<int64_t>();
     T* beta_value = beta->data<T>();
 
     auto x_dims = emission_exps.dims();
diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc
index b71a33a6b1ce80b545e6d7a4020dafc941dc55d2..4e58b84430f2a8697bbbc1acf971fd063120f563 100644
--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
@@ -38,61 +38,7 @@ class LoadOp : public framework::OperatorBase {
                    out_var_name);
 
     auto *tensor = out_var->GetMutable<framework::LoDTensor>();
-
-    uint32_t version;
-    fin.read(reinterpret_cast<char *>(&version), sizeof(version));
-    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-    framework::TensorDesc desc;
-    {  // int32_t size
-       // proto buffer
-      int32_t size;
-      fin.read(reinterpret_cast<char *>(&size), sizeof(size));
-      std::unique_ptr<char[]> buf(new char[size]);
-      fin.read(reinterpret_cast<char *>(buf.get()), size);
-      PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
-                     "Cannot parse tensor desc");
-    }
-    {  // read tensor
-      std::vector<int64_t> dims;
-      dims.reserve(static_cast<size_t>(desc.dims().size()));
-      std::copy(desc.dims().begin(), desc.dims().end(),
-                std::back_inserter(dims));
-      tensor->Resize(framework::make_ddim(dims));
-
-      void *buf;
-      platform::Place cpu = platform::CPUPlace();
-      switch (desc.data_type()) {
-        case framework::FP32:
-          buf = tensor->mutable_data<float>(cpu);
-          break;
-        case framework::FP64:
-          buf = tensor->mutable_data<double>(cpu);
-          break;
-        case framework::INT32:
-          buf = tensor->mutable_data<int>(cpu);
-          break;
-        case framework::INT64:
-          buf = tensor->mutable_data<int64_t>(cpu);
-          break;
-        default:
-          PADDLE_THROW("DataType %d not supported", desc.data_type());
-      }
-      fin.read(static_cast<char *>(buf), tensor->memory_size());
-    }
-    {  // read lod
-      uint64_t lod_level;
-      fin.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
-      auto &lod = *tensor->mutable_lod();
-      lod.resize(lod_level);
-      for (uint64_t i = 0; i < lod_level; ++i) {
-        uint64_t size;
-        fin.read(reinterpret_cast<char *>(&size), sizeof(size));
-        std::vector<size_t> tmp(size / sizeof(size_t));
-        fin.read(reinterpret_cast<char *>(tmp.data()),
-                 static_cast<std::streamsize>(size));
-        lod[i] = tmp;
-      }
-    }
+    framework::DeserializeFromStream(fin, tensor);
 
     auto place = dev_ctx.GetPlace();
     if (platform::is_gpu_place(place)) {
@@ -105,7 +51,7 @@ class LoadOp : public framework::OperatorBase {
       out_var->Clear();
       tensor = out_var->GetMutable<framework::LoDTensor>();
       tensor->set_lod(cpu_tensor.lod());
-      tensor->CopyFrom(cpu_tensor, place, dev_ctx);
+      CopyFrom(cpu_tensor, place, dev_ctx, tensor);
     }
   }
 };
diff --git a/paddle/operators/lod_reset_op.h b/paddle/operators/lod_reset_op.h
index 2bb916ccee80c83a02ea429fe95f5fafc86ccfa6..cbcbf80adc3cf68f9eb28bbe2a69168cc8798347 100644
--- a/paddle/operators/lod_reset_op.h
+++ b/paddle/operators/lod_reset_op.h
@@ -33,7 +33,8 @@ class LoDResetKernel : public framework::OpKernel<T> {
       auto* lod = lod_t->data<int>();
       if (platform::is_gpu_place(ctx.GetPlace())) {
         framework::Tensor lod_cpu;
-        lod_cpu.CopyFrom(*lod_t, platform::CPUPlace(), ctx.device_context());
+        framework::CopyFrom(*lod_t, platform::CPUPlace(), ctx.device_context(),
+                            &lod_cpu);
         lod = lod_cpu.data<int>();
       }
       level0 = std::vector<int>(lod, lod + lod_t->numel());
diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc
index 58af35564d83b9699af4f7783fb6367ff9590682..010c79d4e153463d4b2e48e5fd798d3bc4febaf1 100644
--- a/paddle/operators/lod_tensor_to_array_op.cc
+++ b/paddle/operators/lod_tensor_to_array_op.cc
@@ -81,11 +81,11 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
           continue;
         }
         // out[i][offset: offset+len] = x[each_range.begin: each_range.end]
-        out[i]
-            .Slice(static_cast<int>(offset), static_cast<int>(offset + len))
-            .CopyFrom(x.Slice(static_cast<int>(each_range.begin),
-                              static_cast<int>(each_range.end)),
-                      x.place(), dev_ctx);
+        auto slice = out[i].Slice(static_cast<int>(offset),
+                                  static_cast<int>(offset + len));
+        framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin),
+                                    static_cast<int>(each_range.end)),
+                            x.place(), dev_ctx, &slice);
         offset += len;
       }
     }
diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..257e5c8a49e935dcbdc33e5060118ef1804fa8d7
--- /dev/null
+++ b/paddle/operators/log_loss_op.cc
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/log_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LogLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+                   "Input(Predicted) must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) must be initialized.");
+
+    auto pred_dims = ctx->GetInputDim("Predicted");
+    auto label_dims = ctx->GetInputDim("Labels");
+
+    PADDLE_ENFORCE_EQ(pred_dims, label_dims);
+    PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
+                      "The rank of Input(Predicted) must be 2 and the shape is "
+                      "[batch_size, 1].");
+    PADDLE_ENFORCE_EQ(pred_dims[1], 1,
+                      "Each row of Input(Predicted) contains a real value, "
+                      "so the 2nd dimension of Input(X) must be 1.");
+
+    ctx->SetOutputDim("Loss", {pred_dims[0], 1});
+    ctx->ShareLoD("Predicted", "Loss");
+  }
+};
+
+template <typename AttrType>
+class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LogLossOpMaker(framework::OpProto* proto,
+                 framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Predicted",
+             "The input value (Predicted) of Log loss op."
+             "Predicted is a 2-D tensor with shape [batch_size, 1].");
+    AddInput("Labels",
+             "The target value (Labels) of Log loss op."
+             "Labels is a 2-D tensor with shape [batch_size, 1].");
+    AddOutput("Loss",
+              "The output tensor with shape [batch_size, 1] "
+              "which represents the log loss.");
+    AddAttr<AttrType>("epsilon", "Epsilon in log loss.");
+    AddComment(R"DOC(
+LogLoss Operator.
+
+Log loss is a loss function used for binary classification. Log Loss quantifies
+the accuracy of a classifier by penalising false classifications. Minimising the
+Log Loss is equivalent to maximising the accuracy of the classifier. We define
+Predicted as the values predicted by our model and Labels as the target ground
+truth value. Log loss can evaluate how close the predicted values are to the
+target. The shapes of Predicted and Labels are both [batch_size, 1].
+The equation is:
+
+$$
+Loss = - Labels * log(Predicted + \epsilon) -
+        (1 - Labels) * log(1 - Predicted + \epsilon)
+$$
+
+)DOC");
+  }
+};
+
+class LogLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+                   "Input(Predicted) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
+                   "Input(Loss@GRAD) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")),
+                   "Output(Predicted@GRAD) should not be null.");
+
+    auto pred_dims = ctx->GetInputDim("Predicted");
+    auto label_dims = ctx->GetInputDim("Labels");
+    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
+    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
+
+    auto pred_grad_name = framework::GradVarName("Predicted");
+    ctx->SetOutputDim(pred_grad_name, pred_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
+            ops::LogLossGradOp);
+REGISTER_OP_CPU_KERNEL(log_loss,
+                       ops::LogLossKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    log_loss_grad, ops::LogLossGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/log_loss_op.cu b/paddle/operators/log_loss_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6c189ef3412d7a56205502c7913e93218a03b929
--- /dev/null
+++ b/paddle/operators/log_loss_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/log_loss_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(log_loss,
+                       ops::LogLossKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    log_loss_grad, ops::LogLossGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/log_loss_op.h b/paddle/operators/log_loss_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..73404fce9157fa750a51451fa93646bc4059481a
--- /dev/null
+++ b/paddle/operators/log_loss_op.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename Place, typename T, typename AttrType = T>
+class LogLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* loss_out = ctx.Output<Tensor>("Loss");
+
+    loss_out->mutable_data<T>(ctx.GetPlace());
+
+    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+
+    auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
+    auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
+
+    auto loss = EigenVector<T>::Flatten(*loss_out);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    loss.device(place) = (-(label * (prediction + epsilon).log()) -
+                          ((static_cast<T>(1) - label) *
+                           (static_cast<T>(1) - prediction + epsilon).log()));
+  }
+};
+
+template <typename Place, typename T, typename AttrType = T>
+class LogLossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+
+    auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
+    auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
+
+    auto* dloss = ctx.Input<Tensor>(framework::GradVarName("Loss"));
+    auto* dpred = ctx.Output<Tensor>(framework::GradVarName("Predicted"));
+
+    auto dl = EigenVector<T>::Flatten(*dloss);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    if (dpred) {
+      dpred->mutable_data<T>(ctx.GetPlace());
+      auto dx = framework::EigenVector<T>::Flatten(*dpred);
+      dx.device(place) = dl * (-(label / (prediction + epsilon)) +
+                               ((static_cast<T>(1) - label) /
+                                (static_cast<T>(1) - prediction + epsilon)));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/logical_op.cc b/paddle/operators/logical_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a37582c1d840ac11f847d8743c824ef1aef0fd66
--- /dev/null
+++ b/paddle/operators/logical_op.cc
@@ -0,0 +1,153 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/logical_op.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename OpComment>
+class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BinaryLogicalOpProtoMaker(framework::OpProto *proto,
+                            framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    OpComment comment;
+    AddInput("X",
+             string::Sprintf("(LoDTensor) Left hand operand of %s operator",
+                             comment.type));
+    AddInput("Y",
+             string::Sprintf("(LoDTensor) Right hand operand of %s operator",
+                             comment.type));
+    AddOutput("Out", string::Sprintf(
+                         "(LoDTensor) n-dim bool tensor. Each element is %s",
+                         comment.equation));
+    AddComment(string::Sprintf(R"DOC(%s Operator
+
+It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean tensors.
+Each element of Out is calculated by %s
+)DOC",
+                               comment.type, comment.equation));
+  }
+};
+
+template <typename OpComment>
+class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  UnaryLogicalOpProtoMaker(framework::OpProto *proto,
+                           framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    OpComment comment;
+    AddInput("X", string::Sprintf("(LoDTensor) Operand of %s operator",
+                                  comment.type));
+    AddOutput("Out", string::Sprintf(
+                         "(LoDTensor) n-dim bool tensor. Each element is %s",
+                         comment.equation));
+    AddComment(string::Sprintf(R"DOC(%s Operator
+
+It operates element-wise on X, and returns the Out. X and Out are N-dim boolean tensors.
+Each element of Out is calculated by %s
+)DOC",
+                               comment.type, comment.equation));
+  }
+};
+
+template <typename OpComment>
+class BinaryLogicalOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    OpComment comment;
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "Input(X) of %s operator must not be null", comment.type);
+    PADDLE_ENFORCE(context->HasInput("Y"),
+                   "Input(Y) of %s operator must not be null", comment.type);
+    auto dim_x = context->GetInputDim("X");
+    auto dim_y = context->GetInputDim("Y");
+    PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y),
+                      "The number of elements in X and Y should be same");
+
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+    context->ShareLoD("X", "Out");
+  }
+};
+
+template <typename OpComment>
+class UnaryLogicalOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    OpComment comment;
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "Input(X) of %s operator must not be null", comment.type);
+    auto dim_x = context->GetInputDim("X");
+
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+    context->ShareLoD("X", "Out");
+  }
+};
+
+class LogicalOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx);
+    // LogicalOp kernel's device type is decided by input tensor place
+    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
+    return kt;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_BINARY_LOGICAL_OP(op_type, _equation)                     \
+  struct _##op_type##Comment {                                             \
+    static char type[];                                                    \
+    static char equation[];                                                \
+  };                                                                       \
+  char _##op_type##Comment::type[]{#op_type};                              \
+  char _##op_type##Comment::equation[]{_equation};                         \
+  REGISTER_OPERATOR(                                                       \
+      op_type, ::paddle::operators::LogicalOp,                             \
+      ::paddle::operators::BinaryLogicalOpProtoMaker<_##op_type##Comment>, \
+      ::paddle::operators::BinaryLogicalOpInferShape<_##op_type##Comment>, \
+      ::paddle::framework::EmptyGradOpMaker);
+
+#define REGISTER_UNARY_LOGICAL_OP(op_type, _equation)                     \
+  struct _##op_type##Comment {                                            \
+    static char type[];                                                   \
+    static char equation[];                                               \
+  };                                                                      \
+  char _##op_type##Comment::type[]{#op_type};                             \
+  char _##op_type##Comment::equation[]{_equation};                        \
+  REGISTER_OPERATOR(                                                      \
+      op_type, ::paddle::operators::LogicalOp,                            \
+      ::paddle::operators::UnaryLogicalOpProtoMaker<_##op_type##Comment>, \
+      ::paddle::operators::UnaryLogicalOpInferShape<_##op_type##Comment>, \
+      ::paddle::framework::EmptyGradOpMaker);
+
+REGISTER_BINARY_LOGICAL_OP(logical_and, "Out = X && Y");
+REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU,
+                               paddle::operators::LogicalAndFunctor);
+REGISTER_BINARY_LOGICAL_OP(logical_or, "Out = X && Y");
+REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU,
+                               paddle::operators::LogicalOrFunctor);
+REGISTER_UNARY_LOGICAL_OP(logical_not, "Out = !X");
+REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
+                              paddle::operators::LogicalNotFunctor);
+REGISTER_BINARY_LOGICAL_OP(logical_xor, "Out = (X || Y) && !(X && Y)");
+REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
+                               paddle::operators::LogicalXorFunctor);
diff --git a/paddle/operators/logical_op.cu b/paddle/operators/logical_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d41239b2ca43e7145ea56afcb0af69948838cc48
--- /dev/null
+++ b/paddle/operators/logical_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/logical_op.h"
+
+REGISTER_BINARY_LOGICAL_KERNEL(logical_and, GPU,
+                               paddle::operators::LogicalAndFunctor);
+REGISTER_BINARY_LOGICAL_KERNEL(logical_or, GPU,
+                               paddle::operators::LogicalOrFunctor);
+REGISTER_UNARY_LOGICAL_KERNEL(logical_not, GPU,
+                              paddle::operators::LogicalNotFunctor);
+REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, GPU,
+                               paddle::operators::LogicalXorFunctor);
diff --git a/paddle/operators/logical_op.h b/paddle/operators/logical_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e78a7d6ed87ba950886e6bc667f82118ff78904
--- /dev/null
+++ b/paddle/operators/logical_op.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <math.h>
+#include <type_traits>
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct LogicalAndFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a && b; }
+};
+
+template <typename T>
+struct LogicalOrFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a || b; }
+};
+
+template <typename T>
+struct LogicalNotFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a) const { return !a; }
+};
+
+template <typename T>
+struct LogicalXorFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const {
+    return (a || b) && !(a && b);
+  }
+};
+
+template <typename Place, typename Functor>
+class BinaryLogicalOpKernel
+    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    using T = typename Functor::ELEM_TYPE;
+    auto* x = context.Input<framework::Tensor>("X");
+    auto* y = context.Input<framework::Tensor>("Y");
+    auto* out = context.Output<framework::Tensor>("Out");
+    Functor binary_func;
+    platform::Transform<Place> trans;
+    trans(context.device_context(), x->data<T>(), x->data<T>() + x->numel(),
+          y->data<T>(), out->mutable_data<bool>(context.GetPlace()),
+          binary_func);
+  }
+};
+
+template <typename Place, typename Functor>
+class UnaryLogicalOpKernel
+    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    using T = typename Functor::ELEM_TYPE;
+    auto* x = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    Functor unary_func;
+    platform::Transform<Place> trans;
+    trans(context.device_context(), x->data<T>(), x->data<T>() + x->numel(),
+          out->mutable_data<bool>(context.GetPlace()), unary_func);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \
+  REGISTER_OP_##dev##_KERNEL(                                 \
+      op_type, ::paddle::operators::BinaryLogicalOpKernel<    \
+                   ::paddle::platform::dev##Place, functor<bool>>);
+
+#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \
+  REGISTER_OP_##dev##_KERNEL(                                \
+      op_type, ::paddle::operators::UnaryLogicalOpKernel<    \
+                   ::paddle::platform::dev##Place, functor<bool>>);
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index b9417f1d7fdc663fff751328d18239af3dbb1216..bf47879f772a3013bd7ce78c6f8a6aefe65298f9 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_subdirectory(detail)
 
 if(WITH_GPU)
-    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context)
+    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto)
     nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor)
     nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function)
     nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
@@ -13,9 +13,11 @@ if(WITH_GPU)
     nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
     nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
     nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
+    nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
+    nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
     nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
 else()
-    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context)
+    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
     cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
     cc_library(softmax SRCS softmax.cc DEPS device_context)
     cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context)
@@ -25,6 +27,8 @@ else()
     cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
     cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
     cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
+    cc_library(maxouting SRCS maxouting.cc DEPS device_context)
+    cc_library(unpooling SRCS unpooling.cc DEPS device_context)
     cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
 endif()
 
diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h
index 72f4202bace4461d2597204feaa2a21e355bd1ac..d853507188cf8c80aede1e7646736036e30c9678 100644
--- a/paddle/operators/math/context_project.h
+++ b/paddle/operators/math/context_project.h
@@ -149,7 +149,7 @@ class ContextProjectFunctor {
             Tensor out_t_sub = out_t.Slice(k * context_length,
                                            k * context_length + padding_size);
             Tensor w_sub = padding_data.Slice(k, k + padding_size);
-            out_t_sub.CopyFrom(w_sub, context.GetPlace(), context);
+            framework::CopyFrom(w_sub, context.GetPlace(), context, &out_t_sub);
           }
         }
         if (down_pad > 0) {  // add down pad
@@ -179,7 +179,7 @@ class ContextProjectFunctor {
                 (down_pad_begin_row + t) * context_length);
             Tensor w_sub = padding_data.Slice(
                 up_pad + padding_idx, up_pad + padding_idx + padding_size);
-            out_t_sub.CopyFrom(w_sub, context.GetPlace(), context);
+            framework::CopyFrom(w_sub, context.GetPlace(), context, &out_t_sub);
           }
         }
         out_t.Resize({sequence_height, context_length * sequence_width});
diff --git a/paddle/operators/math/detail/gru_cpu_kernel.h b/paddle/operators/math/detail/gru_cpu_kernel.h
index 51af140cf4d5e6581765bea00033fa53d383230d..4c67dec9cbeb48f400f79f5ed7ba3c939fa2540c 100644
--- a/paddle/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/operators/math/detail/gru_cpu_kernel.h
@@ -25,393 +25,397 @@ namespace detail {
 #ifndef __NVCC__
 
 template <class OpResetOutput, typename T>
-void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
-                                       T *gateValue, T *resetOutputValue,
-                                       T *prevOutputValue, int frameSize,
+void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
+                                       T *gate_value, T *reset_output_value,
+                                       T *prev_output_value, int frame_size,
                                        activation_mode_t active_gate) {
-  T rValueUpdateGate;
-  T rValueResetGate;
-  T rValueResetOutput;
-  T rPrevOut = 0;
-  T *updateGate = gateValue;
-  T *resetGate = gateValue + frameSize;
-
-  for (int i = 0; i < frameSize; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueResetGate = resetGate[i];
-    if (prevOutputValue) {
-      rPrevOut = prevOutputValue[i];
+  T r_value_update_gate;
+  T r_value_reset_gate;
+  T r_value_reset_output;
+  T r_prev_out = 0;
+  T *update_gate = gate_value;
+  T *reset_gate = gate_value + frame_size;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_reset_gate = reset_gate[i];
+    if (prev_output_value) {
+      r_prev_out = prev_output_value[i];
     }
 
-    opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
-                  rValueResetOutput, active_gate);
+    op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
+                    r_value_reset_output, active_gate);
 
-    updateGate[i] = rValueUpdateGate;
-    resetGate[i] = rValueResetGate;
-    resetOutputValue[i] = rValueResetOutput;
+    update_gate[i] = r_value_update_gate;
+    reset_gate[i] = r_value_reset_gate;
+    reset_output_value[i] = r_value_reset_output;
   }
 }
 
 template <class OpFinalOutput, typename T>
-void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput,
-                                       T *gateValue, T *prevOutputValue,
-                                       T *outputValue, int frameSize,
+void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
+                                       T *gate_value, T *prev_output_value,
+                                       T *output_value, int frame_size,
                                        activation_mode_t active_node) {
-  T rValueUpdateGate;
-  T rValueFrameState;
-  T rPrevOut = 0;
-  T rOutput;
-  T *updateGate = gateValue;
-  T *frameState = gateValue + frameSize * 2;
-
-  for (int i = 0; i < frameSize; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueFrameState = frameState[i];
-    if (prevOutputValue) {
-      rPrevOut = prevOutputValue[i];
+  T r_value_update_gate;
+  T r_value_frame_state;
+  T r_prev_out = 0;
+  T r_output;
+  T *update_gate = gate_value;
+  T *frame_state = gate_value + frame_size * 2;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_frame_state = frame_state[i];
+    if (prev_output_value) {
+      r_prev_out = prev_output_value[i];
     }
 
-    opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
-                  active_node);
+    op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
+                    r_output, active_node);
 
-    frameState[i] = rValueFrameState;
-    outputValue[i] = rOutput;
+    frame_state[i] = r_value_frame_state;
+    output_value[i] = r_output;
   }
 }
 
 template <class OpResetOutput, typename T>
-void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput, T *gateValue,
-                                     T *resetOutputValue, T *prevOutputValue,
-                                     int frameSize,
+void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
+                                     T *gate_value, T *reset_output_value,
+                                     T *prev_output_value, int frame_size,
                                      activation_mode_t active_gate) {
 #ifdef __AVX__
-  __m256 rValueUpdateGate;
-  __m256 rValueResetGate;
-  __m256 rValueResetOutput;
-  __m256 rPrevOut = _mm256_set1_ps(0.0f);
-  __m256 *updateGate = (__m256 *)gateValue;
-  __m256 *resetGate = (__m256 *)(gateValue + frameSize);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueResetGate = resetGate[i];
-    if (prevOutputValue) {
-      rPrevOut = ((__m256 *)prevOutputValue)[i];
+  __m256 r_value_update_gate;
+  __m256 r_value_reset_gate;
+  __m256 r_value_reset_output;
+  __m256 r_prev_out = _mm256_set1_ps(0.0f);
+  __m256 *update_gate = (__m256 *)gate_value;
+  __m256 *reset_gate = (__m256 *)(gate_value + frame_size);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_reset_gate = reset_gate[i];
+    if (prev_output_value) {
+      r_prev_out = ((__m256 *)prev_output_value)[i];
     }
 
-    opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
-                  rValueResetOutput, active_gate);
+    op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
+                    r_value_reset_output, active_gate);
 
-    updateGate[i] = rValueUpdateGate;
-    resetGate[i] = rValueResetGate;
-    ((__m256 *)resetOutputValue)[i] = rValueResetOutput;
+    update_gate[i] = r_value_update_gate;
+    reset_gate[i] = r_value_reset_gate;
+    ((__m256 *)reset_output_value)[i] = r_value_reset_output;
   }
 #endif
 }
 
 template <class OpFinalOutput, typename T>
-void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput, T *gateValue,
-                                     T *prevOutputValue, T *outputValue,
-                                     int frameSize,
+void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
+                                     T *gate_value, T *prev_output_value,
+                                     T *output_value, int frame_size,
                                      activation_mode_t active_node) {
 #ifdef __AVX__
-  __m256 rValueUpdateGate;
-  __m256 rValueFrameState;
-  __m256 rPrevOut = _mm256_set1_ps(0.0f);
-  __m256 rOutput;
-  __m256 *updateGate = (__m256 *)gateValue;
-  __m256 *frameState = (__m256 *)(gateValue + frameSize * 2);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueFrameState = frameState[i];
-    if (prevOutputValue) {
-      rPrevOut = ((__m256 *)prevOutputValue)[i];
+  __m256 r_value_update_gate;
+  __m256 r_value_frame_state;
+  __m256 r_prev_out = _mm256_set1_ps(0.0f);
+  __m256 r_output;
+  __m256 *update_gate = (__m256 *)gate_value;
+  __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_frame_state = frame_state[i];
+    if (prev_output_value) {
+      r_prev_out = ((__m256 *)prev_output_value)[i];
     }
 
-    opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
-                  active_node);
+    op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
+                    r_output, active_node);
 
-    frameState[i] = rValueFrameState;
-    ((__m256 *)outputValue)[i] = rOutput;
+    frame_state[i] = r_value_frame_state;
+    ((__m256 *)output_value)[i] = r_output;
   }
 #endif
 }
 
 template <class OpResetOutput, typename T>
-inline void forward_reset_output(OpResetOutput opResetOutput,
-                                 hl_gru_value<T> value, int frameSize,
-                                 int batchSize, activation_mode_t active_gate) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+inline void forward_reset_output(OpResetOutput op_reset_output,
+                                 hl_gru_value<T> value, int frame_size,
+                                 int batch_size,
+                                 activation_mode_t active_gate) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
       hl_avx_gru_forward_reset_output(
-          opResetOutput, value.gateValue, value.resetOutputValue,
-          value.prevOutValue, frameSize, active_gate);
+          op_reset_output, value.gate_value, value.reset_output_value,
+          value.prev_out_value, frame_size, active_gate);
     } else {
       hl_naive_gru_forward_reset_output(
-          opResetOutput, value.gateValue, value.resetOutputValue,
-          value.prevOutValue, frameSize, active_gate);
+          op_reset_output, value.gate_value, value.reset_output_value,
+          value.prev_out_value, frame_size, active_gate);
     }
 
-    value.gateValue += frameSize * 3;
-    value.resetOutputValue += frameSize;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
+    value.gate_value += frame_size * 3;
+    value.reset_output_value += frame_size;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
     }
   }
 }
 
 template <class OpFinalOutput, typename T>
-inline void forward_final_output(OpFinalOutput opFinalOutput,
-                                 hl_gru_value<T> value, int frameSize,
-                                 int batchSize, activation_mode_t active_node) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
-      hl_avx_gru_forward_final_output(opFinalOutput, value.gateValue,
-                                      value.prevOutValue, value.outputValue,
-                                      frameSize, active_node);
+inline void forward_final_output(OpFinalOutput op_final_output,
+                                 hl_gru_value<T> value, int frame_size,
+                                 int batch_size,
+                                 activation_mode_t active_node) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
+                                      value.prev_out_value, value.output_value,
+                                      frame_size, active_node);
     } else {
-      hl_naive_gru_forward_final_output(opFinalOutput, value.gateValue,
-                                        value.prevOutValue, value.outputValue,
-                                        frameSize, active_node);
+      hl_naive_gru_forward_final_output(
+          op_final_output, value.gate_value, value.prev_out_value,
+          value.output_value, frame_size, active_node);
     }
 
-    value.gateValue += frameSize * 3;
-    value.outputValue += frameSize;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
+    value.gate_value += frame_size * 3;
+    value.output_value += frame_size;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
     }
   }
 }
 
 template <class OpStateGrad, typename T>
-void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
-                                      T *gateGrad, T *prevOutValue,
-                                      T *prevOutGrad, T *outputGrad,
-                                      int frameSize,
+void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
+                                      T *gate_grad, T *prev_out_value,
+                                      T *prev_out_grad, T *output_grad,
+                                      int frame_size,
                                       activation_mode_t active_node) {
-  T rUpdateGateValue;
-  T rUpdateGateGrad;
-  T rFrameStateValue;
-  T rFrameStateGrad;
-  T rOutGrad;
-  T rPrevOutValue = 0;
-  T rPrevOutGrad = 0;
-  T *updateGateValue = gateValue;
-  T *updateGateGrad = gateGrad;
-  T *frameStateValue = gateValue + frameSize * 2;
-  T *frameStateGrad = gateGrad + frameSize * 2;
-
-  for (int i = 0; i < frameSize; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rFrameStateValue = frameStateValue[i];
-    rOutGrad = outputGrad[i];
-    if (prevOutValue) {
-      rPrevOutValue = prevOutValue[i];
+  T r_update_gate_value;
+  T r_update_gate_grad;
+  T r_frame_state_value;
+  T r_frame_state_grad;
+  T r_out_grad;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T *update_gate_value = gate_value;
+  T *update_gate_grad = gate_grad;
+  T *frame_state_value = gate_value + frame_size * 2;
+  T *frame_state_grad = gate_grad + frame_size * 2;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_frame_state_value = frame_state_value[i];
+    r_out_grad = output_grad[i];
+    if (prev_out_value) {
+      r_prev_out_value = prev_out_value[i];
     }
-    if (prevOutGrad) {
-      rPrevOutGrad = prevOutGrad[i];
+    if (prev_out_grad) {
+      r_prev_out_grad = prev_out_grad[i];
     }
 
-    opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
-                rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
-                active_node);
+    op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
+                  r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
+                  r_out_grad, active_node);
 
-    updateGateGrad[i] = rUpdateGateGrad;
-    frameStateGrad[i] = rFrameStateGrad;
-    if (prevOutGrad) {
-      prevOutGrad[i] = rPrevOutGrad;
+    update_gate_grad[i] = r_update_gate_grad;
+    frame_state_grad[i] = r_frame_state_grad;
+    if (prev_out_grad) {
+      prev_out_grad[i] = r_prev_out_grad;
     }
   }
 }
 
 template <class OpResetGrad, typename T>
-void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
-                                      T *gateGrad, T *prevOutValue,
-                                      T *prevOutGrad, T *resetOutputGrad,
-                                      int frameSize,
+void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
+                                      T *gate_grad, T *prev_out_value,
+                                      T *prev_out_grad, T *reset_output_grad,
+                                      int frame_size,
                                       activation_mode_t active_gate) {
-  T rUpdateGateValue;
-  T rUpdateGateGrad;
-  T rResetGateValue;
-  T rResetGateGrad;
-  T rResetOutputGrad = 0;
-  T rPrevOutValue = 0;
-  T rPrevOutGrad = 0;
-  T *updateGateValue = gateValue;
-  T *updateGateGrad = gateGrad;
-  T *resetGateValue = gateValue + frameSize;
-  T *resetGateGrad = gateGrad + frameSize;
-
-  for (int i = 0; i < frameSize; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rUpdateGateGrad = updateGateGrad[i];
-    rResetGateValue = resetGateValue[i];
-
-    if (prevOutValue && prevOutGrad) {
-      rResetOutputGrad = resetOutputGrad[i];
+  T r_update_gate_value;
+  T r_update_gate_grad;
+  T r_reset_gate_value;
+  T r_reset_gate_grad;
+  T r_reset_output_grad = 0;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T *update_gate_value = gate_value;
+  T *update_gate_grad = gate_grad;
+  T *reset_gate_value = gate_value + frame_size;
+  T *reset_gate_grad = gate_grad + frame_size;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_update_gate_grad = update_gate_grad[i];
+    r_reset_gate_value = reset_gate_value[i];
+
+    if (prev_out_value && prev_out_grad) {
+      r_reset_output_grad = reset_output_grad[i];
     }
-    if (prevOutValue) {
-      rPrevOutValue = prevOutValue[i];
+    if (prev_out_value) {
+      r_prev_out_value = prev_out_value[i];
     }
-    if (prevOutGrad) {
-      rPrevOutGrad = prevOutGrad[i];
+    if (prev_out_grad) {
+      r_prev_out_grad = prev_out_grad[i];
     }
 
-    opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
-                rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
-                active_gate);
+    op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
+                  r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
+                  r_reset_output_grad, active_gate);
 
-    updateGateGrad[i] = rUpdateGateGrad;
-    resetGateGrad[i] = rResetGateGrad;
-    if (prevOutGrad) {
-      prevOutGrad[i] = rPrevOutGrad;
+    update_gate_grad[i] = r_update_gate_grad;
+    reset_gate_grad[i] = r_reset_gate_grad;
+    if (prev_out_grad) {
+      prev_out_grad[i] = r_prev_out_grad;
     }
   }
 }
 
 template <class OpStateGrad, typename T>
-void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
-                                    T *gateGrad, T *prevOutValue,
-                                    T *prevOutGrad, T *outputGrad,
-                                    int frameSize,
+void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
+                                    T *gate_grad, T *prev_out_value,
+                                    T *prev_out_grad, T *output_grad,
+                                    int frame_size,
                                     activation_mode_t active_node) {
 #ifdef __AVX__
-  __m256 rUpdateGateValue;
-  __m256 rUpdateGateGrad;
-  __m256 rFrameStateValue;
-  __m256 rFrameStateGrad;
-  __m256 rOutGrad;
-  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
-  __m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
-  __m256 *updateGateValue = (__m256 *)gateValue;
-  __m256 *updateGateGrad = (__m256 *)gateGrad;
-  __m256 *frameStateValue = (__m256 *)(gateValue + frameSize * 2);
-  __m256 *frameStateGrad = (__m256 *)(gateGrad + frameSize * 2);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rFrameStateValue = frameStateValue[i];
-    rOutGrad = ((__m256 *)outputGrad)[i];
-    if (prevOutValue) {
-      rPrevOutValue = ((__m256 *)prevOutValue)[i];
+  __m256 r_update_gate_value;
+  __m256 r_update_gate_grad;
+  __m256 r_frame_state_value;
+  __m256 r_frame_state_grad;
+  __m256 r_out_grad;
+  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
+  __m256 *update_gate_value = (__m256 *)gate_value;
+  __m256 *update_gate_grad = (__m256 *)gate_grad;
+  __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2);
+  __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_frame_state_value = frame_state_value[i];
+    r_out_grad = ((__m256 *)output_grad)[i];
+    if (prev_out_value) {
+      r_prev_out_value = ((__m256 *)prev_out_value)[i];
     }
-    if (prevOutGrad) {
-      rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
+    if (prev_out_grad) {
+      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
     }
 
-    opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
-                rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
-                active_node);
+    op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
+                  r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
+                  r_out_grad, active_node);
 
-    updateGateGrad[i] = rUpdateGateGrad;
-    frameStateGrad[i] = rFrameStateGrad;
-    if (prevOutGrad) {
-      ((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
+    update_gate_grad[i] = r_update_gate_grad;
+    frame_state_grad[i] = r_frame_state_grad;
+    if (prev_out_grad) {
+      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
     }
   }
 #endif
 }
 
 template <class OpResetGrad, typename T>
-void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
-                                    T *gateGrad, T *prevOutValue,
-                                    T *prevOutGrad, T *resetOutputGrad,
-                                    int frameSize,
+void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
+                                    T *gate_grad, T *prev_out_value,
+                                    T *prev_out_grad, T *reset_output_grad,
+                                    int frame_size,
                                     activation_mode_t active_gate) {
 #ifdef __AVX__
-  __m256 rUpdateGateValue;
-  __m256 rUpdateGateGrad;
-  __m256 rResetGateValue;
-  __m256 rResetGateGrad;
-  __m256 rResetOutputGrad = _mm256_set1_ps(0.0f);
-  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
-  __m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
-  __m256 *updateGateValue = (__m256 *)gateValue;
-  __m256 *updateGateGrad = (__m256 *)gateGrad;
-  __m256 *resetGateValue = (__m256 *)(gateValue + frameSize);
-  __m256 *resetGateGrad = (__m256 *)(gateGrad + frameSize);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rUpdateGateGrad = updateGateGrad[i];
-    rResetGateValue = resetGateValue[i];
-
-    if (prevOutValue && prevOutGrad) {
-      rResetOutputGrad = ((__m256 *)resetOutputGrad)[i];
+  __m256 r_update_gate_value;
+  __m256 r_update_gate_grad;
+  __m256 r_reset_gate_value;
+  __m256 r_reset_gate_grad;
+  __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
+  __m256 *update_gate_value = (__m256 *)gate_value;
+  __m256 *update_gate_grad = (__m256 *)gate_grad;
+  __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size);
+  __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_update_gate_grad = update_gate_grad[i];
+    r_reset_gate_value = reset_gate_value[i];
+
+    if (prev_out_value && prev_out_grad) {
+      r_reset_output_grad = ((__m256 *)reset_output_grad)[i];
     }
-    if (prevOutValue) {
-      rPrevOutValue = ((__m256 *)prevOutValue)[i];
+    if (prev_out_value) {
+      r_prev_out_value = ((__m256 *)prev_out_value)[i];
     }
-    if (prevOutGrad) {
-      rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
+    if (prev_out_grad) {
+      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
     }
 
-    opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
-                rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
-                active_gate);
+    op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
+                  r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
+                  r_reset_output_grad, active_gate);
 
-    updateGateGrad[i] = rUpdateGateGrad;
-    resetGateGrad[i] = rResetGateGrad;
-    if (prevOutGrad) {
-      ((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
+    update_gate_grad[i] = r_update_gate_grad;
+    reset_gate_grad[i] = r_reset_gate_grad;
+    if (prev_out_grad) {
+      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
     }
   }
 #endif
 }
 
 template <class OpStateGrad, typename T>
-inline void backward_state_grad(OpStateGrad opStateGrad, hl_gru_value<T> value,
-                                hl_gru_grad<T> grad, int frameSize,
-                                int batchSize, activation_mode_t active_node) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+inline void backward_state_grad(OpStateGrad op_state_grad,
+                                hl_gru_value<T> value, hl_gru_grad<T> grad,
+                                int frame_size, int batch_size,
+                                activation_mode_t active_node) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
       hl_avx_gru_backward_state_grad(
-          opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
-          grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
+          op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.output_grad, frame_size, active_node);
     } else {
       hl_naive_gru_backward_state_grad(
-          opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
-          grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
+          op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.output_grad, frame_size, active_node);
     }
 
-    value.gateValue += frameSize * 3;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
+    value.gate_value += frame_size * 3;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
     }
 
-    grad.gateGrad += frameSize * 3;
-    grad.outputGrad += frameSize;
-    if (grad.prevOutGrad) {
-      grad.prevOutGrad += frameSize;
+    grad.gate_grad += frame_size * 3;
+    grad.output_grad += frame_size;
+    if (grad.prev_out_grad) {
+      grad.prev_out_grad += frame_size;
     }
   }
 }
 
 template <class OpResetGrad, typename T>
-inline void backward_reset_grad(OpResetGrad opResetGrad, hl_gru_value<T> value,
-                                hl_gru_grad<T> grad, int frameSize,
-                                int batchSize, activation_mode_t active_gate) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+inline void backward_reset_grad(OpResetGrad op_reset_grad,
+                                hl_gru_value<T> value, hl_gru_grad<T> grad,
+                                int frame_size, int batch_size,
+                                activation_mode_t active_gate) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
       hl_avx_gru_backward_reset_grad(
-          opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
-          grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
+          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
     } else {
       hl_naive_gru_backward_reset_grad(
-          opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
-          grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
+          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
     }
 
-    value.gateValue += frameSize * 3;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
+    value.gate_value += frame_size * 3;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
     }
 
-    grad.gateGrad += frameSize * 3;
-    grad.resetOutputGrad += frameSize;
-    if (grad.prevOutGrad) {
-      grad.prevOutGrad += frameSize;
+    grad.gate_grad += frame_size * 3;
+    grad.reset_output_grad += frame_size;
+    if (grad.prev_out_grad) {
+      grad.prev_out_grad += frame_size;
     }
   }
 }
diff --git a/paddle/operators/math/detail/gru_gpu_kernel.h b/paddle/operators/math/detail/gru_gpu_kernel.h
index 6441c648b048422c110872a85aa8cb719f11a8d7..d2edcb7f258b387530799b967fc0fff61acc5b83 100644
--- a/paddle/operators/math/detail/gru_gpu_kernel.h
+++ b/paddle/operators/math/detail/gru_gpu_kernel.h
@@ -27,174 +27,174 @@ namespace math {
 namespace detail {
 
 /*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
  */
-template <class OpResetOutput, bool isBatch, typename T>
-__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput,
-                                        T *gateValue, T *resetOutputValue,
-                                        T *prevOutputValue, int frameSize,
-                                        int batchSize,
+template <class OpResetOutput, bool is_batch, typename T>
+__global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
+                                        T *gate_value, T *reset_output_value,
+                                        T *prev_output_value, int frame_size,
+                                        int batch_size,
                                         activation_mode_t active_gate) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    resetOutputValue += batchIdx * frameSize;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    reset_output_value += batch_idx * frame_size;
   }
 
-  T rPrevOut = 0;
-  T rValueResetOutput;
-  T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
-  T rValueResetGate = gateValue[frameIdx + frameSize * 1];
+  T r_prev_out = 0;
+  T r_value_reset_output;
+  T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
+  T r_value_reset_gate = gate_value[frame_idx + frame_size * 1];
 
-  if (prevOutputValue) {
-    if (isBatch) prevOutputValue += batchIdx * frameSize;
-    rPrevOut = prevOutputValue[frameIdx];
+  if (prev_output_value) {
+    if (is_batch) prev_output_value += batch_idx * frame_size;
+    r_prev_out = prev_output_value[frame_idx];
   }
 
-  opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, rValueResetOutput,
-                active_gate);
+  op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
+                  r_value_reset_output, active_gate);
 
-  gateValue[frameIdx + frameSize * 0] = rValueUpdateGate;
-  gateValue[frameIdx + frameSize * 1] = rValueResetGate;
-  resetOutputValue[frameIdx] = rValueResetOutput;
+  gate_value[frame_idx + frame_size * 0] = r_value_update_gate;
+  gate_value[frame_idx + frame_size * 1] = r_value_reset_gate;
+  reset_output_value[frame_idx] = r_value_reset_output;
 }
 
 /*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
  */
-template <class OpFinalOutput, bool isBatch, typename T>
-__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput,
-                                        T *gateValue, T *prevOutputValue,
-                                        T *outputValue, int frameSize,
-                                        int batchSize,
+template <class OpFinalOutput, bool is_batch, typename T>
+__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
+                                        T *gate_value, T *prev_output_value,
+                                        T *output_value, int frame_size,
+                                        int batch_size,
                                         activation_mode_t active_node) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    outputValue += batchIdx * frameSize;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    output_value += batch_idx * frame_size;
   }
 
-  T rOutput;
-  T rPrevOut = 0;
-  T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
-  T rValueFrameState = gateValue[frameIdx + frameSize * 2];
+  T r_output;
+  T r_prev_out = 0;
+  T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
+  T r_value_frame_state = gate_value[frame_idx + frame_size * 2];
 
-  if (prevOutputValue) {
-    if (isBatch) prevOutputValue += batchIdx * frameSize;
-    rPrevOut = prevOutputValue[frameIdx];
+  if (prev_output_value) {
+    if (is_batch) prev_output_value += batch_idx * frame_size;
+    r_prev_out = prev_output_value[frame_idx];
   }
 
-  opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
-                active_node);
+  op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
+                  r_output, active_node);
 
-  gateValue[frameIdx + frameSize * 2] = rValueFrameState;
-  outputValue[frameIdx] = rOutput;
+  gate_value[frame_idx + frame_size * 2] = r_value_frame_state;
+  output_value[frame_idx] = r_output;
 }
 
 /*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
  */
-template <class OpStateGrad, bool isBatch, typename T>
-__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, T *gateValue,
-                                       T *gateGrad, T *prevOutValue,
-                                       T *prevOutGrad, T *outputGrad,
-                                       int frameSize, int batchSize,
+template <class OpStateGrad, bool is_batch, typename T>
+__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
+                                       T *gate_grad, T *prev_out_value,
+                                       T *prev_out_grad, T *output_grad,
+                                       int frame_size, int batch_size,
                                        activation_mode_t active_node) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    gateGrad += batchIdx * 3 * frameSize;
-    outputGrad += batchIdx * frameSize;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    gate_grad += batch_idx * 3 * frame_size;
+    output_grad += batch_idx * frame_size;
   }
 
-  T rUpdateGateGrad;
-  T rFrameStateGrad;
-  T rPrevOutValue = 0;
-  T rPrevOutGrad = 0;
-  T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
-  T rFrameStateValue = gateValue[frameIdx + frameSize * 2];
-  T rOutGrad = outputGrad[frameIdx];
+  T r_update_gate_grad;
+  T r_frame_state_grad;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
+  T r_frame_state_value = gate_value[frame_idx + frame_size * 2];
+  T r_out_grad = output_grad[frame_idx];
 
-  if (prevOutValue && prevOutGrad) {
-    if (isBatch) prevOutValue += batchIdx * frameSize;
-    rPrevOutValue = prevOutValue[frameIdx];
+  if (prev_out_value && prev_out_grad) {
+    if (is_batch) prev_out_value += batch_idx * frame_size;
+    r_prev_out_value = prev_out_value[frame_idx];
 
-    if (isBatch) prevOutGrad += batchIdx * frameSize;
-    rPrevOutGrad = prevOutGrad[frameIdx];
+    if (is_batch) prev_out_grad += batch_idx * frame_size;
+    r_prev_out_grad = prev_out_grad[frame_idx];
   }
 
-  opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
-              rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
-              active_node);
+  op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
+                r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
+                r_out_grad, active_node);
 
-  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
-  gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad;
-  if (prevOutGrad) {
-    prevOutGrad[frameIdx] = rPrevOutGrad;
+  gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
+  gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad;
+  if (prev_out_grad) {
+    prev_out_grad[frame_idx] = r_prev_out_grad;
   }
 }
 
 /*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
  */
-template <class OpResetGrad, bool isBatch, typename T>
-__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, T *gateValue,
-                                       T *gateGrad, T *prevOutValue,
-                                       T *prevOutGrad, T *resetOutputGrad,
-                                       int frameSize, int batchSize,
+template <class OpResetGrad, bool is_batch, typename T>
+__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value,
+                                       T *gate_grad, T *prev_out_value,
+                                       T *prev_out_grad, T *reset_output_grad,
+                                       int frame_size, int batch_size,
                                        activation_mode_t active_gate) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    gateGrad += batchIdx * 3 * frameSize;
-    resetOutputGrad += batchIdx * frameSize;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    gate_grad += batch_idx * 3 * frame_size;
+    reset_output_grad += batch_idx * frame_size;
   }
 
-  T rResetGateGrad;
-  T rPrevOutValue = 0;
-  T rPrevOutGrad = 0;
-  T rResetOutputGrad = 0;
-  T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
-  T rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0];
-  T rResetGateValue = gateValue[frameIdx + frameSize * 1];
-
-  if (prevOutValue && prevOutGrad) {
-    if (isBatch) prevOutValue += batchIdx * frameSize;
-    if (isBatch) prevOutGrad += batchIdx * frameSize;
-    rPrevOutValue = prevOutValue[frameIdx];
-    rPrevOutGrad = prevOutGrad[frameIdx];
-    rResetOutputGrad = resetOutputGrad[frameIdx];
+  T r_reset_gate_grad;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T r_reset_output_grad = 0;
+  T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
+  T r_update_gate_grad = gate_grad[frame_idx + frame_size * 0];
+  T r_reset_gate_value = gate_value[frame_idx + frame_size * 1];
+
+  if (prev_out_value && prev_out_grad) {
+    if (is_batch) prev_out_value += batch_idx * frame_size;
+    if (is_batch) prev_out_grad += batch_idx * frame_size;
+    r_prev_out_value = prev_out_value[frame_idx];
+    r_prev_out_grad = prev_out_grad[frame_idx];
+    r_reset_output_grad = reset_output_grad[frame_idx];
   }
 
-  opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
-              rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
-              active_gate);
+  op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
+                r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
+                r_reset_output_grad, active_gate);
 
-  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
-  gateGrad[frameIdx + frameSize * 1] = rResetGateGrad;
-  if (prevOutGrad) {
-    prevOutGrad[frameIdx] = rPrevOutGrad;
+  gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
+  gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad;
+  if (prev_out_grad) {
+    prev_out_grad[frame_idx] = r_prev_out_grad;
   }
 }
 }  // namespace detail
diff --git a/paddle/operators/math/detail/gru_kernel.h b/paddle/operators/math/detail/gru_kernel.h
index 8a681d8d8bced72e1296f863489f6ccbc7913167..acd84be01db9ddaf06d165d8be353b253f324dd2 100644
--- a/paddle/operators/math/detail/gru_kernel.h
+++ b/paddle/operators/math/detail/gru_kernel.h
@@ -28,23 +28,25 @@ namespace forward {
 template <typename T>
 class gru_resetOutput {
  public:
-  HOSTDEVICE void operator()(T &valueUpdateGate, T &valueResetGate, T &prevOut,
-                             T &valueResetOutput, activation_mode_t actGate) {
-    valueUpdateGate = activation(valueUpdateGate, actGate);
-    valueResetGate = activation(valueResetGate, actGate);
-    valueResetOutput = prevOut * valueResetGate;
+  HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate,
+                             T &prev_out, T &value_reset_output,
+                             activation_mode_t act_gate) {
+    value_update_gate = activation(value_update_gate, act_gate);
+    value_reset_gate = activation(value_reset_gate, act_gate);
+    value_reset_output = prev_out * value_reset_gate;
   }
 #ifndef __NVCC__
 #ifndef __AVX__
   static const bool avx = false;
 #else
   static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueResetGate,
-                             __m256 &prevOut, __m256 &valueResetOutput,
-                             activation_mode_t actGate) {
-    valueUpdateGate = activation(valueUpdateGate, actGate);
-    valueResetGate = activation(valueResetGate, actGate);
-    valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate);
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &value_reset_gate, __m256 &prev_out,
+                             __m256 &value_reset_output,
+                             activation_mode_t act_gate) {
+    value_update_gate = activation(value_update_gate, act_gate);
+    value_reset_gate = activation(value_reset_gate, act_gate);
+    value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate);
   }
 #endif
 #endif
@@ -53,24 +55,26 @@ class gru_resetOutput {
 template <typename T>
 class gru_finalOutput {
  public:
-  HOSTDEVICE void operator()(T &valueUpdateGate, T &valueFrameState, T &prevOut,
-                             T &valueOutput, activation_mode_t actInput) {
-    valueFrameState = activation(valueFrameState, actInput);
-    valueOutput = prevOut - (valueUpdateGate * prevOut) +
-                  (valueUpdateGate * valueFrameState);
+  HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state,
+                             T &prev_out, T &value_output,
+                             activation_mode_t act_input) {
+    value_frame_state = activation(value_frame_state, act_input);
+    value_output = prev_out - (value_update_gate * prev_out) +
+                   (value_update_gate * value_frame_state);
   }
 #ifndef __NVCC__
 #ifndef __AVX__
   static const bool avx = false;
 #else
   static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueFrameState,
-                             __m256 &prevOut, __m256 &valueOutput,
-                             activation_mode_t actInput) {
-    valueFrameState = activation(valueFrameState, actInput);
-    valueOutput = _mm256_add_ps(
-        _mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)),
-        _mm256_mul_ps(valueUpdateGate, valueFrameState));
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &value_frame_state, __m256 &prev_out,
+                             __m256 &value_output,
+                             activation_mode_t act_input) {
+    value_frame_state = activation(value_frame_state, act_input);
+    value_output = _mm256_add_ps(
+        _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)),
+        _mm256_mul_ps(value_update_gate, value_frame_state));
   }
 #endif
 #endif
@@ -82,34 +86,37 @@ namespace backward {
 template <typename T>
 class gru_stateGrad {
  public:
-  HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
-                             T &valueFrameState, T &gradFrameState,
-                             T &valuePrevOut, T &gradPrevOut, T &gradOutput,
-                             activation_mode_t actInput) {
-    gradUpdateGate = (gradOutput * valueFrameState);
-    gradUpdateGate -= (gradOutput * valuePrevOut);
-    gradPrevOut -= (gradOutput * valueUpdateGate);
-    gradPrevOut += gradOutput;
-    gradFrameState =
-        activation(gradOutput * valueUpdateGate, valueFrameState, actInput);
+  HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
+                             T &value_frame_state, T &grad_frame_state,
+                             T &value_prev_out, T &grad_prev_out,
+                             T &grad_output, activation_mode_t act_input) {
+    grad_update_gate = (grad_output * value_frame_state);
+    grad_update_gate -= (grad_output * value_prev_out);
+    grad_prev_out -= (grad_output * value_update_gate);
+    grad_prev_out += grad_output;
+    grad_frame_state = activation(grad_output * value_update_gate,
+                                  value_frame_state, act_input);
   }
 #ifndef __NVCC__
 #ifndef __AVX__
   static const bool avx = false;
 #else
   static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
-                             __m256 &valueFrameState, __m256 &gradFrameState,
-                             __m256 &valuePrevOut, __m256 &gradPrevOut,
-                             __m256 &gradOutput, activation_mode_t actInput) {
-    gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState);
-    gradUpdateGate =
-        _mm256_sub_ps(gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut));
-    gradPrevOut = _mm256_add_ps(
-        _mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)),
-        gradOutput);
-    gradFrameState = activation(_mm256_mul_ps(gradOutput, valueUpdateGate),
-                                valueFrameState, actInput);
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &grad_update_gate,
+                             __m256 &value_frame_state,
+                             __m256 &grad_frame_state, __m256 &value_prev_out,
+                             __m256 &grad_prev_out, __m256 &grad_output,
+                             activation_mode_t act_input) {
+    grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state);
+    grad_update_gate = _mm256_sub_ps(
+        grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out));
+    grad_prev_out = _mm256_add_ps(
+        _mm256_sub_ps(grad_prev_out,
+                      _mm256_mul_ps(grad_output, value_update_gate)),
+        grad_output);
+    grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate),
+                                  value_frame_state, act_input);
   }
 #endif
 #endif
@@ -118,30 +125,32 @@ class gru_stateGrad {
 template <typename T>
 class gru_resetGrad {
  public:
-  HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
-                             T &valueResetGate, T &gradResetGate,
-                             T &valuePrevOut, T &gradPrevOut,
-                             T &gradResetOutput, activation_mode_t actGate) {
-    gradResetGate = (gradResetOutput * valuePrevOut);
-    gradPrevOut += (gradResetOutput * valueResetGate);
-    gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
-    gradResetGate = activation(gradResetGate, valueResetGate, actGate);
+  HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
+                             T &value_reset_gate, T &grad_reset_gate,
+                             T &value_prev_out, T &grad_prev_out,
+                             T &grad_reset_output, activation_mode_t act_gate) {
+    grad_reset_gate = (grad_reset_output * value_prev_out);
+    grad_prev_out += (grad_reset_output * value_reset_gate);
+    grad_update_gate =
+        activation(grad_update_gate, value_update_gate, act_gate);
+    grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
   }
 #ifndef __NVCC__
 #ifndef __AVX__
   static const bool avx = false;
 #else
   static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
-                             __m256 &valueResetGate, __m256 &gradResetGate,
-                             __m256 &valuePrevOut, __m256 &gradPrevOut,
-                             __m256 &gradResetOutput,
-                             activation_mode_t actGate) {
-    gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut);
-    gradPrevOut = _mm256_add_ps(gradPrevOut,
-                                _mm256_mul_ps(gradResetOutput, valueResetGate));
-    gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
-    gradResetGate = activation(gradResetGate, valueResetGate, actGate);
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &grad_update_gate, __m256 &value_reset_gate,
+                             __m256 &grad_reset_gate, __m256 &value_prev_out,
+                             __m256 &grad_prev_out, __m256 &grad_reset_output,
+                             activation_mode_t act_gate) {
+    grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out);
+    grad_prev_out = _mm256_add_ps(
+        grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate));
+    grad_update_gate =
+        activation(grad_update_gate, value_update_gate, act_gate);
+    grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
   }
 #endif
 #endif
diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc
index 125af449d3f700e24be5e4b7615c3b0e03fd4e5b..ae4e47b014a9cd1f656dd9332086aa4d1b7cbb52 100644
--- a/paddle/operators/math/gru_compute.cc
+++ b/paddle/operators/math/gru_compute.cc
@@ -21,29 +21,29 @@ namespace math {
 template <typename T>
 struct GRUUnitFunctor<platform::CPUPlace, T> {
   static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      hl_gru_value<T> value, int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate) {
 #ifndef __NVCC__
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
       math::gemm<platform::CPUPlace, T>(
-          context, false, false, batchSize, frameSize * 2, frameSize, 1,
-          value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
-          value.gateValue, frameSize * 3);
+          context, false, false, batch_size, frame_size * 2, frame_size, 1,
+          value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
+          1, value.gate_value, frame_size * 3);
     }
 
     detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
-                                 frameSize, batchSize, active_gate);
+                                 frame_size, batch_size, active_gate);
 
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
       math::gemm<platform::CPUPlace, T>(
-          context, false, false, batchSize, frameSize, frameSize, 1,
-          value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
-          value.gateValue + frameSize * 2, frameSize * 3);
+          context, false, false, batch_size, frame_size, frame_size, 1,
+          value.reset_output_value, frame_size, value.state_weight, frame_size,
+          1, value.gate_value + frame_size * 2, frame_size * 3);
     }
 
     detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
-                                 frameSize, batchSize, active_node);
+                                 frame_size, batch_size, active_node);
 #endif
   }
 };
@@ -51,41 +51,43 @@ struct GRUUnitFunctor<platform::CPUPlace, T> {
 template <typename T>
 struct GRUUnitGradFunctor<platform::CPUPlace, T> {
   static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
-                      int batchSize, activation_mode_t active_node,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad,
+                      int frame_size, int batch_size,
+                      activation_mode_t active_node,
                       activation_mode_t active_gate) {
 #ifndef __NVCC__
     detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
-                                grad, frameSize, batchSize, active_node);
+                                grad, frame_size, batch_size, active_node);
 
-    if (value.prevOutValue && grad.prevOutGrad) {
+    if (value.prev_out_value && grad.prev_out_grad) {
       math::gemm<platform::CPUPlace, T>(
-          context, false, true, batchSize, frameSize, frameSize, 1,
-          grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
-          frameSize, 0, grad.resetOutputGrad, frameSize);
+          context, false, true, batch_size, frame_size, frame_size, 1,
+          grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
+          frame_size, 0, grad.reset_output_grad, frame_size);
 
-      if (grad.stateWeightGrad) {
+      if (grad.state_weight_grad) {
         math::gemm<platform::CPUPlace, T>(
-            context, true, false, frameSize, frameSize, batchSize, 1,
-            value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
-            frameSize * 3, 1, grad.stateWeightGrad, frameSize);
+            context, true, false, frame_size, frame_size, batch_size, 1,
+            value.reset_output_value, frame_size,
+            grad.gate_grad + frame_size * 2, frame_size * 3, 1,
+            grad.state_weight_grad, frame_size);
       }
     }
 
     detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
-                                grad, frameSize, batchSize, active_gate);
+                                grad, frame_size, batch_size, active_gate);
 
-    if (grad.prevOutGrad && value.prevOutValue) {
+    if (grad.prev_out_grad && value.prev_out_value) {
       math::gemm<platform::CPUPlace, T>(
-          context, false, true, batchSize, frameSize, frameSize * 2, 1,
-          grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
-          grad.prevOutGrad, frameSize);
+          context, false, true, batch_size, frame_size, frame_size * 2, 1,
+          grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
+          grad.prev_out_grad, frame_size);
 
-      if (grad.gateWeightGrad) {
+      if (grad.gate_weight_grad) {
         math::gemm<platform::CPUPlace, T>(
-            context, true, false, frameSize, frameSize * 2, batchSize, 1,
-            value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
-            grad.gateWeightGrad, frameSize * 2);
+            context, true, false, frame_size, frame_size * 2, batch_size, 1,
+            value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
+            grad.gate_weight_grad, frame_size * 2);
       }
     }
 #endif
diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu
index 7b9e54ac029f6aa00553338435684097d6d02b25..0252bdbdb63fef2e4754057fc5b6d415cef0c29f 100644
--- a/paddle/operators/math/gru_compute.cu
+++ b/paddle/operators/math/gru_compute.cu
@@ -21,66 +21,66 @@ namespace math {
 template <typename T>
 struct GRUUnitFunctor<platform::GPUPlace, T> {
   static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      hl_gru_value<T> value, int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate) {
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
     dim3 threads;
     dim3 grid;
-    if (batchSize == 1) {
-      int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-      int frameBlocks = (frameSize + 1024 - 1) / 1024;
-      threads = dim3(framePerBlock, 1);
-      grid = dim3(frameBlocks, 1);
+    if (batch_size == 1) {
+      int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
+      int frame_blocks = (frame_size + 1024 - 1) / 1024;
+      threads = dim3(frame_per_block, 1);
+      grid = dim3(frame_blocks, 1);
     } else {
       threads = dim3(32, 32);
-      grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+      grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
     }
 
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
       math::gemm<platform::GPUPlace, T>(
-          context, false, false, batchSize, frameSize * 2, frameSize, 1,
-          value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
-          value.gateValue, frameSize * 3);
+          context, false, false, batch_size, frame_size * 2, frame_size, 1,
+          value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
+          1, value.gate_value, frame_size * 3);
     }
 
-    if (batchSize == 1) {
+    if (batch_size == 1) {
       detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
-                                      /* isBatch= */ false,
+                                      /* is_batch= */ false,
                                       T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_resetOutput<T>(), value.gateValue,
-          value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
-          active_gate);
+          detail::forward::gru_resetOutput<T>(), value.gate_value,
+          value.reset_output_value, value.prev_out_value, frame_size,
+          batch_size, active_gate);
     } else {
       detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
-                                      /* isBatch= */ true,
+                                      /* is_batch= */ true,
                                       T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_resetOutput<T>(), value.gateValue,
-          value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
-          active_gate);
+          detail::forward::gru_resetOutput<T>(), value.gate_value,
+          value.reset_output_value, value.prev_out_value, frame_size,
+          batch_size, active_gate);
     }
 
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
       math::gemm<platform::GPUPlace, T>(
-          context, false, false, batchSize, frameSize, frameSize, 1,
-          value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
-          value.gateValue + frameSize * 2, frameSize * 3);
+          context, false, false, batch_size, frame_size, frame_size, 1,
+          value.reset_output_value, frame_size, value.state_weight, frame_size,
+          1, value.gate_value + frame_size * 2, frame_size * 3);
     }
 
-    if (batchSize == 1) {
+    if (batch_size == 1) {
       detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
-                                      /* isBatch= */ false,
+                                      /* is_batch= */ false,
                                       T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_finalOutput<T>(), value.gateValue,
-          value.prevOutValue, value.outputValue, frameSize, batchSize,
+          detail::forward::gru_finalOutput<T>(), value.gate_value,
+          value.prev_out_value, value.output_value, frame_size, batch_size,
           active_node);
     } else {
       detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
-                                      /* isBatch= */ true,
+                                      /* is_batch= */ true,
                                       T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_finalOutput<T>(), value.gateValue,
-          value.prevOutValue, value.outputValue, frameSize, batchSize,
+          detail::forward::gru_finalOutput<T>(), value.gate_value,
+          value.prev_out_value, value.output_value, frame_size, batch_size,
           active_node);
     }
   }
@@ -89,80 +89,82 @@ struct GRUUnitFunctor<platform::GPUPlace, T> {
 template <typename T>
 struct GRUUnitGradFunctor<platform::GPUPlace, T> {
   static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
-                      int batchSize, activation_mode_t active_node,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad,
+                      int frame_size, int batch_size,
+                      activation_mode_t active_node,
                       activation_mode_t active_gate) {
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
     dim3 threads;
     dim3 grid;
-    if (batchSize == 1) {
-      int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-      int frameBlocks = (frameSize + 1024 - 1) / 1024;
-      threads = dim3(framePerBlock, 1);
-      grid = dim3(frameBlocks, 1);
+    if (batch_size == 1) {
+      int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
+      int frame_blocks = (frame_size + 1024 - 1) / 1024;
+      threads = dim3(frame_per_block, 1);
+      grid = dim3(frame_blocks, 1);
     } else {
       threads = dim3(32, 32);
-      grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+      grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
     }
 
-    if (batchSize == 1) {
+    if (batch_size == 1) {
       detail::KeGruBackwardStateGrad<
           detail::backward::gru_stateGrad<T>,
-          /* isBatch= */ false><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
-          value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
-          batchSize, active_node);
+          /* is_batch= */ false><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_stateGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.output_grad, frame_size, batch_size, active_node);
     } else {
       detail::KeGruBackwardStateGrad<
           detail::backward::gru_stateGrad<T>,
-          /* isBatch= */ true><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
-          value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
-          batchSize, active_node);
+          /* is_batch= */ true><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_stateGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.output_grad, frame_size, batch_size, active_node);
     }
 
-    if (value.prevOutValue && grad.prevOutGrad) {
+    if (value.prev_out_value && grad.prev_out_grad) {
       math::gemm<platform::GPUPlace, T>(
-          context, false, true, batchSize, frameSize, frameSize, 1,
-          grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
-          frameSize, 0, grad.resetOutputGrad, frameSize);
+          context, false, true, batch_size, frame_size, frame_size, 1,
+          grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
+          frame_size, 0, grad.reset_output_grad, frame_size);
 
-      if (grad.stateWeightGrad) {
+      if (grad.state_weight_grad) {
         math::gemm<platform::GPUPlace, T>(
-            context, true, false, frameSize, frameSize, batchSize, 1,
-            value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
-            frameSize * 3, 1, grad.stateWeightGrad, frameSize);
+            context, true, false, frame_size, frame_size, batch_size, 1,
+            value.reset_output_value, frame_size,
+            grad.gate_grad + frame_size * 2, frame_size * 3, 1,
+            grad.state_weight_grad, frame_size);
       }
     }
 
-    if (batchSize == 1) {
+    if (batch_size == 1) {
       detail::KeGruBackwardResetGrad<
           detail::backward::gru_resetGrad<T>,
-          /* isBatch= */ false><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
-          value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
-          batchSize, active_gate);
+          /* is_batch= */ false><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_resetGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.reset_output_grad, frame_size, batch_size, active_gate);
     } else {
       detail::KeGruBackwardResetGrad<
           detail::backward::gru_resetGrad<T>,
-          /* isBatch= */ true><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
-          value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
-          batchSize, active_gate);
+          /* is_batch= */ true><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_resetGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.reset_output_grad, frame_size, batch_size, active_gate);
     }
 
-    if (grad.prevOutGrad && value.prevOutValue) {
+    if (grad.prev_out_grad && value.prev_out_value) {
       math::gemm<platform::GPUPlace, T>(
-          context, false, true, batchSize, frameSize, frameSize * 2, 1,
-          grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
-          grad.prevOutGrad, frameSize);
+          context, false, true, batch_size, frame_size, frame_size * 2, 1,
+          grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
+          grad.prev_out_grad, frame_size);
 
-      if (grad.gateWeightGrad) {
+      if (grad.gate_weight_grad) {
         math::gemm<platform::GPUPlace, T>(
-            context, true, false, frameSize, frameSize * 2, batchSize, 1,
-            value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
-            grad.gateWeightGrad, frameSize * 2);
+            context, true, false, frame_size, frame_size * 2, batch_size, 1,
+            value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
+            grad.gate_weight_grad, frame_size * 2);
       }
     }
   }
diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h
index 1475fb38104f353857dfd968e46af98a6d52c52a..58ea59f68e91c647a6b29ce3e8bc7e5d25db9b9b 100644
--- a/paddle/operators/math/gru_compute.h
+++ b/paddle/operators/math/gru_compute.h
@@ -22,28 +22,28 @@ namespace math {
 // TODO(guosheng): refine code style in gru_compute
 template <typename T>
 struct hl_gru_value {
-  T *gateWeight;
-  T *stateWeight;
-  T *gateValue;
-  T *resetOutputValue;
-  T *outputValue;
-  T *prevOutValue;
+  T *gate_weight;
+  T *state_weight;
+  T *gate_value;
+  T *reset_output_value;
+  T *output_value;
+  T *prev_out_value;
 };
 
 template <typename T>
 struct hl_gru_grad {
-  T *gateWeightGrad;
-  T *stateWeightGrad;
-  T *gateGrad;
-  T *resetOutputGrad;
-  T *outputGrad;
-  T *prevOutGrad;
+  T *gate_weight_grad;
+  T *state_weight_grad;
+  T *gate_grad;
+  T *reset_output_grad;
+  T *output_grad;
+  T *prev_out_grad;
 };
 
 template <typename Place, typename T>
 struct GRUUnitFunctor {
   static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      hl_gru_value<T> value, int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate);
 };
@@ -51,8 +51,9 @@ struct GRUUnitFunctor {
 template <typename Place, typename T>
 struct GRUUnitGradFunctor {
   static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
-                      int batchSize, activation_mode_t active_node,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad,
+                      int frame_size, int batch_size,
+                      activation_mode_t active_node,
                       activation_mode_t active_gate);
 };
 
diff --git a/paddle/operators/math/im2col.cu b/paddle/operators/math/im2col.cu
index 347df7a0ffdec163c0479a71ec775a813930ba5f..bf7894243919571c2ab15d53690b1ef05bfcc6ee 100644
--- a/paddle/operators/math/im2col.cu
+++ b/paddle/operators/math/im2col.cu
@@ -119,8 +119,8 @@ __global__ void col2im(int n, const T* data_col, int im_height, int im_width,
 
   if (index < n) {
     T val = 0;
-    int w = index % im_width;
-    int h = (index / im_width) % im_height;
+    int w = index % im_width + padding_width;
+    int h = (index / im_width) % im_height + padding_height;
     int c = index / (im_width * im_height);
 
     // compute the start and end of the output
diff --git a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h
index deb60051beef56437cf75f0fa2cef90bbc0a209a..24fd9a06e9f5fbd50483429379cf3f46ff88bcaa 100644
--- a/paddle/operators/math/im2col.h
+++ b/paddle/operators/math/im2col.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
 #include "paddle/platform/device_context.h"
 
 namespace paddle {
diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc
index 10c28da72ba9d3b94bb59c5cf00e7f5a2f28fd06..ae197a97ed8aa089b51be77a59a8ba6a98ac70ec 100644
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
@@ -74,7 +74,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom(input_tmp, *place, *context);
+    CopyFrom(input_tmp, *place, *context, &input);
   }
   output_cfo.mutable_data<float>(
       {1, filter_size, filter_size, output_height, output_width}, *place);
@@ -99,7 +99,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_cfo_ptr = output_cfo.data<float>();
   } else {
-    output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context);
+    CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp);
     out_cfo_ptr = output_tmp.data<float>();
   }
   for (int i = 0; i < 6; ++i) {
@@ -110,7 +110,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_ocf_ptr = output_ocf.data<float>();
   } else {
-    output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context);
+    CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp);
     out_ocf_ptr = output_tmp.data<float>();
   }
   for (int i = 0; i < 6; ++i) {
@@ -130,7 +130,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom(input_tmp, *place, *context);
+    CopyFrom(input_tmp, *place, *context, &input);
   }
 
   col2im(*context, output_cfo, dilation, stride, padding, &input);
@@ -139,7 +139,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     in_ptr = input.data<float>();
   } else {
-    input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context);
+    CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp);
     in_ptr = input_tmp.data<float>();
   }
   for (int i = 0; i < 6; ++i) {
@@ -151,7 +151,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom(input_tmp, *place, *context);
+    CopyFrom(input_tmp, *place, *context, &input);
   }
 
   col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
@@ -159,7 +159,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     in_ptr = input.data<float>();
   } else {
-    input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context);
+    CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp);
     in_ptr = input_tmp.data<float>();
   }
   for (int i = 0; i < 6; ++i) {
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index 58356a4b7783241ca0292829bf05dc1a8ed80c6c..3018e50a4f54592123df6b9cadd45ce525d7b3e1 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -297,7 +297,25 @@ void set_constant_with_place<platform::GPUPlace>(
 template struct RowwiseAdd<platform::GPUPlace, float>;
 template struct RowwiseAdd<platform::GPUPlace, double>;
 template struct ColwiseSum<platform::GPUPlace, float>;
-template struct ColwiseSum<platform::GPUPlace, double>;
+// template struct ColwiseSum<platform::GPUPlace, double>;
+// The ColwiseSum<platform::GPUPlace, double> failed in debug mode,
+// and only failed for this case. So reimplemented it.
+template <>
+void ColwiseSum<platform::GPUPlace, double>::operator()(
+    const platform::DeviceContext& context, const framework::Tensor& input,
+    framework::Tensor* vector) {
+  auto in_dims = input.dims();
+  auto size = input.numel() / in_dims[0];
+  PADDLE_ENFORCE_EQ(vector->numel(), size);
+  framework::Tensor one;
+  one.mutable_data<double>({in_dims[0]}, context.GetPlace());
+  SetConstant<platform::GPUPlace, double> set;
+  set(context, &one, static_cast<double>(1.0));
+  gemv<platform::GPUPlace, double>(context, true, static_cast<int>(in_dims[0]),
+                                   static_cast<int>(in_dims[1]), 1.0,
+                                   input.data<double>(), one.data<double>(),
+                                   0.0, vector->data<double>());
+}
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h
index ffb99f53808c4316ede96b04e57aec4dae4134de..5a42854f22234629b3405ec2397143ef761a9d08 100644
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -49,6 +49,7 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
 
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 
diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu
index 780d17ffc6539c5f4d67ebab5476d6f646840b41..d5d6f0c73bc6bce7a74db2c98fa9f884a0bcd9a2 100644
--- a/paddle/operators/math/math_function_test.cu
+++ b/paddle/operators/math/math_function_test.cu
@@ -16,15 +16,15 @@ TEST(math_function, notrans_mul_trans) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
-  input1_gpu.CopyFrom(input1, *gpu_place, context);
-  input2_gpu.CopyFrom(input1, *gpu_place, context);
+  paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::CopyFrom(input1, *gpu_place, context, &input2_gpu);
 
   out_gpu.mutable_data<float>({2, 2}, *gpu_place);
 
   paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
       context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
 
-  out.CopyFrom(out_gpu, *cpu_place, context);
+  paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out);
 
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -50,15 +50,15 @@ TEST(math_function, trans_mul_notrans) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
-  input1_gpu.CopyFrom(input1, *gpu_place, context);
-  input2_gpu.CopyFrom(input1, *gpu_place, context);
+  paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::CopyFrom(input1, *gpu_place, context, &input2_gpu);
 
   out_gpu.mutable_data<float>({3, 3}, *gpu_place);
 
   paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
       context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
 
-  out.CopyFrom(out_gpu, *cpu_place, context);
+  paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out);
 
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -99,9 +99,9 @@ TEST(math_function, gemm_notrans_cublas) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
-  input1_gpu.CopyFrom(input1, *gpu_place, context);
-  input2_gpu.CopyFrom(input2, *gpu_place, context);
-  input3_gpu.CopyFrom(input3, *gpu_place, context);
+  paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::CopyFrom(input2, *gpu_place, context, &input2_gpu);
+  paddle::framework::CopyFrom(input3, *gpu_place, context, &input3_gpu);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(*gpu_place);
@@ -109,7 +109,7 @@ TEST(math_function, gemm_notrans_cublas) {
   paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
       context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
 
-  input3.CopyFrom(input3_gpu, *cpu_place, context);
+  paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3);
 
   // numpy code:
   // a = np.arange(6).reshape(2, 3)
@@ -154,9 +154,9 @@ TEST(math_function, gemm_trans_cublas) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
-  input1_gpu.CopyFrom(input1, *gpu_place, context);
-  input2_gpu.CopyFrom(input2, *gpu_place, context);
-  input3_gpu.CopyFrom(input3, *gpu_place, context);
+  paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::CopyFrom(input2, *gpu_place, context, &input2_gpu);
+  paddle::framework::CopyFrom(input3, *gpu_place, context, &input3_gpu);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(*gpu_place);
@@ -164,7 +164,7 @@ TEST(math_function, gemm_trans_cublas) {
   paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
       context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
 
-  input3.CopyFrom(input3_gpu, *cpu_place, context);
+  paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3);
   context.Wait();
 
   EXPECT_EQ(input3_ptr[0], 0);
@@ -205,14 +205,15 @@ void GemvTest(int m, int n, bool trans) {
   }
 
   paddle::platform::CUDADeviceContext context(*gpu_place);
-  g_mat_a.CopyFrom(mat_a, *gpu_place, context);
-  g_vec_b.CopyFrom(vec_b, *gpu_place, context);
+  paddle::framework::CopyFrom(mat_a, *gpu_place, context, &g_mat_a);
+  paddle::framework::CopyFrom(vec_b, *gpu_place, context, &g_vec_b);
 
   paddle::operators::math::gemv<paddle::platform::GPUPlace, T>(
       context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
       g_data_b, 0., g_data_c);
 
-  vec_c.CopyFrom(g_vec_c, paddle::platform::CPUPlace(), context);
+  paddle::framework::CopyFrom(g_vec_c, paddle::platform::CPUPlace(), context,
+                              &vec_c);
 
   if (!trans) {
     for (int i = 0; i < m; ++i) {
diff --git a/paddle/operators/math/maxouting.cc b/paddle/operators/math/maxouting.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c9003962d33b70b8e21a0d6b78bf5a77981df409
--- /dev/null
+++ b/paddle/operators/math/maxouting.cc
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/maxouting.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// All tensors are in NCHW format, and the groups must be greater than 1
+template <typename T>
+class MaxOutFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    int fea_size = input_height * input_width;
+    // c_size means the output size of each sample
+    int c_size = fea_size * output_channels;
+    const T* input_data = input.data<T>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; ++i) {
+      int new_bindex = c_size * i;
+      for (int c = 0; c < output_channels; ++c) {
+        int new_cindex = fea_size * c;
+        for (int f = 0; f < fea_size; ++f) {
+          T ele = static_cast<T>(-FLT_MAX);
+          for (int ph = 0; ph < groups; ++ph) {
+            T x = input_data[(new_bindex + new_cindex) * groups +
+                             ph * fea_size + f];
+            ele = ele > x ? ele : x;
+          }
+          output_data[(new_bindex + new_cindex + f)] = ele;
+        }
+      }
+    }
+  }
+};
+
+template <class T>
+class MaxOutGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, int groups) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    int fea_size = input_height * input_width;
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; ++i) {
+      int blen = fea_size * output_channels * i;
+      for (int c = 0; c < output_channels; ++c) {
+        int clen = fea_size * c;
+        for (int f = 0; f < fea_size; ++f) {
+          int input_idx0 = (blen + clen) * groups + f;
+          bool continue_match = true;
+          int output_idx = blen + clen + f;
+          for (int g = 0; g < groups && continue_match; ++g) {
+            int input_idx = input_idx0 + fea_size * g;
+            if (input_data[input_idx] == output_data[output_idx]) {
+              input_grad_data[input_idx] += output_grad_data[output_idx];
+              continue_match = false;
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template class MaxOutGradFunctor<platform::CPUPlace, float>;
+template class MaxOutGradFunctor<platform::CPUPlace, double>;
+template class MaxOutFunctor<platform::CPUPlace, float>;
+template class MaxOutFunctor<platform::CPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/maxouting.cu b/paddle/operators/math/maxouting.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c3fabcae081e24d92d50d0e2a2cad4a2e9872125
--- /dev/null
+++ b/paddle/operators/math/maxouting.cu
@@ -0,0 +1,152 @@
+/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/maxouting.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+__global__ void KernelMaxOut(const int nthreads, const T* input_data,
+                             const int channels, const int input_height,
+                             const int input_width, int groups,
+                             T* output_data) {
+  const int size = input_height * input_width * channels / groups;
+  const int feat_len = input_height * input_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int batch_idx = i / size;
+    int batch_offset = i % size;
+    int channel_idx = batch_offset / feat_len;
+    int feat_idx = batch_offset % feat_len;
+    int data_idx =
+        (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
+    T ele = static_cast<T>(-FLT_MAX);
+    for (int g = 0; g < groups; ++g) {
+      T x = input_data[data_idx + g * feat_len];
+      ele = ele > x ? ele : x;
+    }
+    output_data[i] = ele;
+  }
+}
+template <typename T>
+__global__ void KernelMaxoutGrad(const int nthreads, const T* input_data,
+                                 const T* output_data, const T* output_grad,
+                                 T* input_grad, const int channels,
+                                 const int input_height, const int input_width,
+                                 int groups) {
+  const int size = input_height * input_width * channels / groups;
+  const int feat_len = input_height * input_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int batch_idx = i / size;
+    int batch_offset = i % size;
+    int channel_idx = batch_offset / feat_len;
+    int feat_idx = batch_offset % feat_len;
+    int data_idx =
+        (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
+    int max_index = -1;
+    bool continue_match = true;
+    for (int g = 0; g < groups && continue_match; ++g) {
+      if (input_data[data_idx + g * feat_len] == output_data[i]) {
+        max_index = data_idx + g * feat_len;
+        continue_match = false;
+        break;
+      }
+    }
+    if (max_index != -1) {
+      input_grad[max_index] += output_grad[index];
+    }
+  }
+}
+/*
+ * All tensors are in NCHW format.
+ */
+template <typename T>
+class MaxOutFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+
+    const T* input_data = input.data<T>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    int nthreads = output->numel();
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxOut<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(nthreads, input_data, input_channels,
+                              input_height, input_width, groups, output_data);
+  }
+};
+/*
+ * All tensors are in NCHW format.
+ */
+template <typename T>
+class MaxOutGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, int groups) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    int nthreads = output.numel();
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxoutGrad<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(nthreads, input_data, output_data,
+                              output_grad_data, input_grad_data, input_channels,
+                              input_height, input_width, groups);
+  }
+};
+
+template class MaxOutGradFunctor<platform::GPUPlace, float>;
+template class MaxOutGradFunctor<platform::GPUPlace, double>;
+
+template class MaxOutFunctor<platform::GPUPlace, float>;
+template class MaxOutFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/maxouting.h b/paddle/operators/math/maxouting.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d9069b0b3ca3e7bad3b21a46985c52ef00f50e6
--- /dev/null
+++ b/paddle/operators/math/maxouting.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+#define FLT_MAX __FLT_MAX__
+
+template <typename Place, typename T>
+
+class MaxOutFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups);
+};
+
+template <typename Place, class T>
+class MaxOutGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, int groups);
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/pooling.cc b/paddle/operators/math/pooling.cc
index ead89e146f32ef005b06f4f6f04224d691805d74..135984586a67f666425f81456148c3623ed7ef25 100644
--- a/paddle/operators/math/pooling.cc
+++ b/paddle/operators/math/pooling.cc
@@ -498,8 +498,8 @@ template class Pool3dGradFunctor<
  * Ksize, strides, paddings are two elements. These two elements represent
  * height and width, respectively.
  */
-template <typename T>
-class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool2dWithIndexFunctor<platform::CPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
                   const framework::Tensor& input, std::vector<int>& ksize,
@@ -520,9 +520,9 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
     const int input_stride = input_height * input_width;
     const int output_stride = output_height * output_width;
 
-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    T* mask_data = mask->mutable_data<T>(context.GetPlace());
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -535,7 +535,7 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
             int wend = std::min(wstart + ksize_width, input_width);
             wstart = std::max(wstart, 0);
 
-            T ele = static_cast<T>(-FLT_MAX);
+            T1 ele = static_cast<T1>(-FLT_MAX);
             int index = -1;
             for (int h = hstart; h < hend; ++h) {
               for (int w = wstart; w < wend; ++w) {
@@ -563,8 +563,8 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
  * Ksize, strides, paddings are two elements. These two elements represent
  * height and width, respectively.
  */
-template <typename T>
-class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
                   const framework::Tensor& output_grad,
@@ -580,9 +580,9 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
     const int input_stride = input_height * input_width;
     const int output_stride = output_height * output_width;
 
-    const T* mask_data = mask.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    const T2* mask_data = mask.data<T2>();
+    const T1* output_grad_data = output_grad.data<T1>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
 
     for (int n = 0; n < batch_size; ++n) {
       for (int c = 0; c < output_channels; ++c) {
@@ -602,18 +602,18 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
   }
 };
 
-template class MaxPool2dWithIndexFunctor<platform::CPUPlace, float>;
-template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, float>;
-template class MaxPool2dWithIndexFunctor<platform::CPUPlace, double>;
-template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, double>;
+template class MaxPool2dWithIndexFunctor<platform::CPUPlace, float, int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, float, int>;
+template class MaxPool2dWithIndexFunctor<platform::CPUPlace, double, int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, double, int>;
 
 /*
  * All tensors are in NCDHW format.
  * Ksize, strides, paddings are three elements. These three elements represent
  * depth, height and width, respectively.
  */
-template <typename T>
-class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool3dWithIndexFunctor<platform::CPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
                   const framework::Tensor& input, std::vector<int>& ksize,
@@ -639,9 +639,9 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
     const int input_stride = input_depth * input_height * input_width;
     const int output_stride = output_depth * output_height * output_width;
 
-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    T* mask_data = mask->mutable_data<T>(context.GetPlace());
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -659,7 +659,7 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
               wstart = std::max(wstart, 0);
 
               int output_idx = (pd * output_height + ph) * output_width + pw;
-              T ele = static_cast<T>(-FLT_MAX);
+              T1 ele = static_cast<T1>(-FLT_MAX);
               int index = -1;
               for (int d = dstart; d < dend; ++d) {
                 for (int h = hstart; h < hend; ++h) {
@@ -691,8 +691,8 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
  * Ksize, strides, paddings are three elements. These three elements represent
  * depth, height and width, respectively.
  */
-template <typename T>
-class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
                   const framework::Tensor& output_grad,
@@ -710,9 +710,9 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
     const int input_stride = input_depth * input_height * input_width;
     const int output_stride = output_depth * output_height * output_width;
 
-    const T* mask_data = mask.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    const T2* mask_data = mask.data<T2>();
+    const T1* output_grad_data = output_grad.data<T1>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
 
     for (int n = 0; n < batch_size; ++n) {
       for (int c = 0; c < output_channels; ++c) {
@@ -735,10 +735,10 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
   }
 };
 
-template class MaxPool3dWithIndexFunctor<platform::CPUPlace, float>;
-template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, float>;
-template class MaxPool3dWithIndexFunctor<platform::CPUPlace, double>;
-template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, double>;
+template class MaxPool3dWithIndexFunctor<platform::CPUPlace, float, int>;
+template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, float, int>;
+template class MaxPool3dWithIndexFunctor<platform::CPUPlace, double, int>;
+template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, double, int>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/pooling.cu b/paddle/operators/math/pooling.cu
index 6d1138ad50cb095e85b4ceb44fa81731316f10dd..ca3560f264b59057fd655084f3d43adc617c6606 100644
--- a/paddle/operators/math/pooling.cu
+++ b/paddle/operators/math/pooling.cu
@@ -658,13 +658,13 @@ template class Pool3dGradFunctor<
 template class Pool3dGradFunctor<
     platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
 
-template <typename T>
+template <typename T1, typename T2>
 __global__ void KernelMaxPool2dWithIdx(
-    const int nthreads, const T* input_data, const int channels,
+    const int nthreads, const T1* input_data, const int channels,
     const int input_height, const int input_width, const int output_height,
     const int output_width, const int ksize_height, const int ksize_width,
     const int stride_height, const int stride_width, const int padding_height,
-    const int padding_width, T* output_data, T* mask_data) {
+    const int padding_width, T1* output_data, T2* mask_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -681,7 +681,7 @@ __global__ void KernelMaxPool2dWithIdx(
     wstart = max(wstart, 0);
 
     input_data += (batch_idx * channels + c) * input_height * input_width;
-    T ele = -FLT_MAX;
+    T1 ele = -FLT_MAX;
     int max_index = -1;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
@@ -697,13 +697,13 @@ __global__ void KernelMaxPool2dWithIdx(
   }
 }
 
-template <typename T>
+template <typename T1, typename T2>
 __global__ void KernelMaxPool2DWithIdxGrad(
-    const int nthreads, const T* output_grad, const T* mask_data,
+    const int nthreads, const T1* output_grad, const T2* mask_data,
     const int channels, const int input_height, const int input_width,
     const int output_height, const int output_width, const int ksize_height,
     const int ksize_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width, T* input_grad) {
+    const int padding_height, const int padding_width, T1* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int w_offset = index % input_width;
@@ -724,7 +724,7 @@ __global__ void KernelMaxPool2DWithIdxGrad(
     int pw_end =
         min((w_offset + padding_width) / stride_width + 1, output_width);
 
-    T gradient = 0;
+    T1 gradient = 0;
     int input_current_featuremap_idx = h_offset * input_width + w_offset;
     int output_idx =
         (batch_idx * channels + c_offset) * output_height * output_width;
@@ -746,8 +746,8 @@ __global__ void KernelMaxPool2DWithIdxGrad(
  * Ksize, strides, paddings are two elements. These two elements represent
  * height and width, respectively.
  */
-template <typename T>
-class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool2dWithIndexFunctor<platform::GPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
                   const framework::Tensor& input, std::vector<int>& ksize,
@@ -767,9 +767,9 @@ class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
     const int padding_height = paddings[0];
     const int padding_width = paddings[1];
 
-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    T* mask_data = mask->mutable_data<T>(context.GetPlace());
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_height * output_width;
     int blocks = (nthreads + 1024 - 1) / 1024;
@@ -777,9 +777,9 @@ class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
     dim3 grid(blocks, 1);
 
     KernelMaxPool2dWithIdx<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
+        T1, T2><<<grid, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(
         nthreads, input_data, input_channels, input_height, input_width,
         output_height, output_width, ksize_height, ksize_width, stride_height,
         stride_width, padding_height, padding_width, output_data, mask_data);
@@ -791,8 +791,8 @@ class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
  * Ksize, strides, paddings are two elements. These two elements represent
  * height and width, respectively.
  */
-template <typename T>
-class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
                   const framework::Tensor& output_grad,
@@ -812,9 +812,9 @@ class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {
     const int padding_height = paddings[0];
     const int padding_width = paddings[1];
 
-    const T* mask_data = mask.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    const T2* mask_data = mask.data<T2>();
+    const T1* output_grad_data = output_grad.data<T1>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
 
     int nthreads = batch_size * input_channels * input_height * input_width;
     int blocks = (nthreads + 1024 - 1) / 1024;
@@ -822,30 +822,30 @@ class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {
     dim3 grid(blocks, 1);
 
     KernelMaxPool2DWithIdxGrad<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(nthreads, output_grad_data, mask_data,
-                              input_channels, input_height, input_width,
-                              output_height, output_width, ksize_height,
-                              ksize_width, stride_height, stride_width,
-                              padding_height, padding_width, input_grad_data);
+        T1, T2><<<grid, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(
+        nthreads, output_grad_data, mask_data, input_channels, input_height,
+        input_width, output_height, output_width, ksize_height, ksize_width,
+        stride_height, stride_width, padding_height, padding_width,
+        input_grad_data);
   }
 };
 
-template class MaxPool2dWithIndexFunctor<platform::GPUPlace, float>;
-template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, float>;
-template class MaxPool2dWithIndexFunctor<platform::GPUPlace, double>;
-template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, double>;
+template class MaxPool2dWithIndexFunctor<platform::GPUPlace, float, int>;
+template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, float, int>;
+template class MaxPool2dWithIndexFunctor<platform::GPUPlace, double, int>;
+template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, double, int>;
 
-template <typename T>
+template <typename T1, typename T2>
 __global__ void KernelMaxPool3DWithIdx(
-    const int nthreads, const T* input_data, const int channels,
+    const int nthreads, const T1* input_data, const int channels,
     const int input_depth, const int input_height, const int input_width,
     const int output_depth, const int output_height, const int output_width,
     const int ksize_depth, const int ksize_height, const int ksize_width,
     const int stride_depth, const int stride_height, const int stride_width,
     const int padding_depth, const int padding_height, const int padding_width,
-    T* output_data, T* mask_data) {
+    T1* output_data, T2* mask_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -865,7 +865,7 @@ __global__ void KernelMaxPool3DWithIdx(
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
 
-    T ele = -FLT_MAX;
+    T1 ele = -FLT_MAX;
     int max_index = -1;
     input_data +=
         (batch_idx * channels + c) * input_depth * input_height * input_width;
@@ -885,15 +885,15 @@ __global__ void KernelMaxPool3DWithIdx(
   }
 }
 
-template <typename T>
+template <typename T1, typename T2>
 __global__ void KernelMaxPool3DWithIdxGrad(
-    const int nthreads, const T* output_grad, const T* mask, const int channels,
-    const int input_depth, const int input_height, const int input_width,
-    const int output_depth, const int output_height, const int output_width,
-    const int ksize_depth, const int ksize_height, const int ksize_width,
-    const int stride_depth, const int stride_height, const int stride_width,
-    const int padding_depth, const int padding_height, const int padding_width,
-    T* input_grad) {
+    const int nthreads, const T1* output_grad, const T2* mask,
+    const int channels, const int input_depth, const int input_height,
+    const int input_width, const int output_depth, const int output_height,
+    const int output_width, const int ksize_depth, const int ksize_height,
+    const int ksize_width, const int stride_depth, const int stride_height,
+    const int stride_width, const int padding_depth, const int padding_height,
+    const int padding_width, T1* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int w_offset = index % input_width;
@@ -922,7 +922,7 @@ __global__ void KernelMaxPool3DWithIdxGrad(
     int pw_end =
         min((w_offset + padding_width) / stride_width + 1, output_width);
 
-    T gradient = 0;
+    T1 gradient = 0;
     int input_current_feature_map_idx =
         (d_offset * input_height + h_offset) * input_width + w_offset;
     int output_idx = (batch_idx * channels + c_offset) * output_depth *
@@ -949,8 +949,8 @@ __global__ void KernelMaxPool3DWithIdxGrad(
  * Ksize, strides, paddings are three elements. These three elements represent
  * depth, height and width, respectively.
  */
-template <typename T>
-class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool3dWithIndexFunctor<platform::GPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
                   const framework::Tensor& input, std::vector<int>& ksize,
@@ -975,9 +975,9 @@ class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
     const int padding_height = paddings[1];
     const int padding_width = paddings[2];
 
-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    T* mask_data = mask->mutable_data<T>(context.GetPlace());
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
@@ -986,9 +986,9 @@ class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
     dim3 grid(blocks, 1);
 
     KernelMaxPool3DWithIdx<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
+        T1, T2><<<grid, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(
         nthreads, input_data, input_channels, input_depth, input_height,
         input_width, output_depth, output_height, output_width, ksize_depth,
         ksize_height, ksize_width, stride_depth, stride_height, stride_width,
@@ -1001,8 +1001,8 @@ class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
  * Ksize, strides, paddings are three elements. These three elements represent
  * depth, height and width, respectively.
  */
-template <typename T>
-class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
                   const framework::Tensor& output_grad,
@@ -1027,9 +1027,9 @@ class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
     const int padding_height = paddings[1];
     const int padding_width = paddings[2];
 
-    const T* output_grad_data = output_grad.data<T>();
-    const T* mask_data = mask.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    const T1* output_grad_data = output_grad.data<T1>();
+    const T2* mask_data = mask.data<T2>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
 
     int nthreads =
         batch_size * input_channels * input_depth * input_height * input_width;
@@ -1038,9 +1038,9 @@ class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
     dim3 grid(blocks, 1);
 
     KernelMaxPool3DWithIdxGrad<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
+        T1, T2><<<grid, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(
         nthreads, output_grad_data, mask_data, input_channels, input_depth,
         input_height, input_width, output_depth, output_height, output_width,
         ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
@@ -1049,10 +1049,10 @@ class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
   }
 };
 
-template class MaxPool3dWithIndexFunctor<platform::GPUPlace, float>;
-template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, float>;
-template class MaxPool3dWithIndexFunctor<platform::GPUPlace, double>;
-template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, double>;
+template class MaxPool3dWithIndexFunctor<platform::GPUPlace, float, int>;
+template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, float, int>;
+template class MaxPool3dWithIndexFunctor<platform::GPUPlace, double, int>;
+template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, double, int>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/pooling.h b/paddle/operators/math/pooling.h
index f6719e1e628cdd2cf7445ec9cd05713bc4f14c84..19fbd8b4bb2469d3ce8a139ce30a48641dbd6e0f 100644
--- a/paddle/operators/math/pooling.h
+++ b/paddle/operators/math/pooling.h
@@ -153,7 +153,7 @@ class MaxPool3dGradFunctor {
  * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in
  * NCDHW format.
  */
-template <typename Place, typename T>
+template <typename Place, typename T1, typename T2>
 class MaxPool2dWithIndexFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
@@ -162,7 +162,7 @@ class MaxPool2dWithIndexFunctor {
                   framework::Tensor* output, framework::Tensor* mask);
 };
 
-template <typename Place, typename T>
+template <typename Place, typename T1, typename T2>
 class MaxPool2dWithIndexGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
@@ -172,7 +172,7 @@ class MaxPool2dWithIndexGradFunctor {
                   framework::Tensor* input_grad);
 };
 
-template <typename Place, typename T>
+template <typename Place, typename T1, typename T2>
 class MaxPool3dWithIndexFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
@@ -181,7 +181,7 @@ class MaxPool3dWithIndexFunctor {
                   framework::Tensor* output, framework::Tensor* mask);
 };
 
-template <typename Place, typename T>
+template <typename Place, typename T1, typename T2>
 class MaxPool3dWithIndexGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc
index 075196b47eeaf118a588b96532d87a05e4e600c6..514f2adef284c8877e2e74b943b4e6419c6ae721 100644
--- a/paddle/operators/math/selected_rows_functor.cc
+++ b/paddle/operators/math/selected_rows_functor.cc
@@ -145,6 +145,8 @@ struct SelectedRowsAddTo<platform::CPUPlace, T> {
 
 template struct SelectedRowsAddTo<platform::CPUPlace, float>;
 template struct SelectedRowsAddTo<platform::CPUPlace, double>;
+template struct SelectedRowsAddTo<platform::CPUPlace, int>;
+template struct SelectedRowsAddTo<platform::CPUPlace, int64_t>;
 
 template <typename T>
 struct SelectedRowsAddToTensor<platform::CPUPlace, T> {
@@ -175,6 +177,8 @@ struct SelectedRowsAddToTensor<platform::CPUPlace, T> {
 
 template struct SelectedRowsAddToTensor<platform::CPUPlace, float>;
 template struct SelectedRowsAddToTensor<platform::CPUPlace, double>;
+template struct SelectedRowsAddToTensor<platform::CPUPlace, int>;
+template struct SelectedRowsAddToTensor<platform::CPUPlace, int64_t>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu
index 47fe3b44a50fee9f41ae807793187258159b9f29..c1dd323ba29e03e3ab4a3e4d7248388b408fb9d6 100644
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
@@ -173,6 +173,8 @@ struct SelectedRowsAddTo<platform::GPUPlace, T> {
 
 template struct SelectedRowsAddTo<platform::GPUPlace, float>;
 template struct SelectedRowsAddTo<platform::GPUPlace, double>;
+template struct SelectedRowsAddTo<platform::GPUPlace, int>;
+template struct SelectedRowsAddTo<platform::GPUPlace, int64_t>;
 
 namespace {
 template <typename T, int block_size>
@@ -223,7 +225,8 @@ struct SelectedRowsAddToTensor<platform::GPUPlace, T> {
 
 template struct SelectedRowsAddToTensor<platform::GPUPlace, float>;
 template struct SelectedRowsAddToTensor<platform::GPUPlace, double>;
-
+template struct SelectedRowsAddToTensor<platform::GPUPlace, int>;
+template struct SelectedRowsAddToTensor<platform::GPUPlace, int64_t>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu
index 09de9dc53a1de9537b5109b3cc7cf9744f9c7908..7de9291c17d3f09a3c6076f00f2457f240e6f0af 100644
--- a/paddle/operators/math/selected_rows_functor_test.cu
+++ b/paddle/operators/math/selected_rows_functor_test.cu
@@ -67,7 +67,7 @@ TEST(selected_rows_functor, gpu_add) {
   EXPECT_EQ(out_rows[6], 9);
 
   Tensor out_cpu;
-  out_cpu.CopyFrom(*out_value, cpu_place, ctx);
+  CopyFrom(*out_value, cpu_place, ctx, &out_cpu);
   ctx.Wait();
 
   auto* out_cpu_data = out_cpu.data<float>();
@@ -94,7 +94,7 @@ TEST(selected_rows_functor, gpu_add) {
   add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
 
   Tensor tensor2_cpu;
-  tensor2_cpu.CopyFrom(*tensor2, cpu_place, ctx);
+  CopyFrom(*tensor2, cpu_place, ctx, &tensor2_cpu);
   ctx.Wait();
 
   auto* tensor2_cpu_data = tensor2_cpu.data<float>();
@@ -167,7 +167,7 @@ TEST(selected_rows_functor, gpu_add_to) {
   EXPECT_EQ(out_rows[6], 9);
 
   Tensor out_cpu;
-  out_cpu.CopyFrom(*out_value, cpu_place, ctx);
+  CopyFrom(*out_value, cpu_place, ctx, &out_cpu);
   ctx.Wait();
 
   auto* out_cpu_data = out_cpu.data<float>();
@@ -191,7 +191,7 @@ TEST(selected_rows_functor, gpu_add_to) {
   add_to_tensor_functor(ctx, *output, tensor1.get());
 
   Tensor tensor1_cpu;
-  tensor1_cpu.CopyFrom(*tensor1, cpu_place, ctx);
+  CopyFrom(*tensor1, cpu_place, ctx, &tensor1_cpu);
   ctx.Wait();
 
   auto* tensor1_cpu_data = tensor1_cpu.data<float>();
diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b57d3dc1414cff492db8d7d503a7fce370a3f151
--- /dev/null
+++ b/paddle/operators/math/unpooling.cc
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/unpooling.h"
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename T>
+class Unpool2dMaxFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    int input_feasize = input_height * input_width;
+    int output_feasize = output_height * output_width;
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    for (int b = 0; b < batch_size; ++b) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int i = 0; i < input_feasize; ++i) {
+          int index = indices_data[i];
+          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
+          output_data[index] = input_data[i];
+        }
+        input_data += input_feasize;
+        indices_data += input_feasize;
+        output_data += output_feasize;
+      }
+    }
+  }
+};
+template <class T>
+class Unpool2dMaxGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    int input_feasize = input_height * input_width;
+    int output_feasize = output_height * output_width;
+    const int* indices_data = indices.data<int>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    for (int b = 0; b < batch_size; ++b) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int i = 0; i < input_feasize; ++i) {
+          int index = indices_data[i];
+          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
+          input_grad_data[i] = output_grad_data[index];
+        }
+        input_grad_data += input_feasize;
+        indices_data += input_feasize;
+        output_grad_data += output_feasize;
+      }
+    }
+  }
+};
+template class Unpool2dMaxGradFunctor<platform::CPUPlace, float>;
+template class Unpool2dMaxGradFunctor<platform::CPUPlace, double>;
+template class Unpool2dMaxFunctor<platform::CPUPlace, float>;
+template class Unpool2dMaxFunctor<platform::CPUPlace, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu
new file mode 100644
index 0000000000000000000000000000000000000000..37c3c8b689f9a69b68ddffd23813fa9ad8ced0e7
--- /dev/null
+++ b/paddle/operators/math/unpooling.cu
@@ -0,0 +1,134 @@
+/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/unpooling.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename T>
+__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data,
+                                  const int* indices_data,
+                                  const int input_height, const int input_width,
+                                  const int channels, T* output_data,
+                                  const int output_height,
+                                  const int output_width) {
+  int in_n_stride = input_height * input_width * channels;
+  int in_c_stride = input_height * input_width;
+  int out_n_stride = output_height * output_width * channels;
+  int out_c_stride = output_height * output_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int bidx = i / in_n_stride;
+    int boffset = i % in_n_stride;
+    int cidx = boffset / in_c_stride;
+    int out_offset = bidx * out_n_stride + cidx * out_c_stride;
+    int out_index = indices_data[i];
+    PADDLE_ASSERT(out_index < out_c_stride);
+    output_data[out_offset + out_index] = input_data[i];
+  }
+}
+template <typename T>
+__global__ void KernelUnpool2dMaxGrad(
+    const int nthreads, const T* input_data, const int* indices_data,
+    const int input_height, const int input_width, const int channels,
+    const T* output_data, const T* output_grad, const int output_height,
+    const int output_width, T* input_grad) {
+  int in_n_stride = input_height * input_width * channels;
+  int in_c_stride = input_height * input_width;
+  int out_n_stride = output_height * output_width * channels;
+  int out_c_stride = output_height * output_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int bidx = i / in_n_stride;
+    int boffset = i % in_n_stride;
+    int cidx = boffset / in_c_stride;
+    int out_offset = bidx * out_n_stride + cidx * out_c_stride;
+    int out_index = indices_data[i];
+    PADDLE_ASSERT(out_index < out_c_stride);
+    input_grad[i] = output_grad[out_offset + out_index];
+  }
+}
+/*
+ * All tensors are in NCHW format.
+ */
+template <typename T>
+class Unpool2dMaxFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    int threads = 1024;
+    int grid = (input.numel() + threads - 1) / threads;
+    KernelUnpool2dMax<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(input.numel(), input_data, indices_data,
+                              input_height, input_width, output_channels,
+                              output_data, output_height, output_width);
+  }
+};
+/*
+ * All tensors are in NCHW format.
+ */
+template <typename T>
+class Unpool2dMaxGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    int threads = 1024;
+    int grid = (input.numel() + threads - 1) / threads;
+    KernelUnpool2dMaxGrad<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(input.numel(), input_data, indices_data,
+                              input_height, input_width, output_channels,
+                              output_data, output_grad_data, output_height,
+                              output_width, input_grad_data);
+  }
+};
+template class Unpool2dMaxGradFunctor<platform::GPUPlace, float>;
+template class Unpool2dMaxGradFunctor<platform::GPUPlace, double>;
+template class Unpool2dMaxFunctor<platform::GPUPlace, float>;
+template class Unpool2dMaxFunctor<platform::GPUPlace, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..7077d7c2274fd9e02b69ef343f310f4ffbbcff1a
--- /dev/null
+++ b/paddle/operators/math/unpooling.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename Place, typename T>
+class Unpool2dMaxFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output);
+};
+template <typename Place, class T>
+class Unpool2dMaxGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad);
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/vol2col.h b/paddle/operators/math/vol2col.h
index cbc30bd754608dd6e6def1a4097d69bdf0c942c3..dc64d1d9776261541a380ed15207904d6b4e641c 100644
--- a/paddle/operators/math/vol2col.h
+++ b/paddle/operators/math/vol2col.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
 #include "paddle/platform/device_context.h"
 
 namespace paddle {
diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc
index c31c716842f30de67c29b803866b8c82ddcf4a41..62c3152304ad7fe946c996be413e102f3dd92bb2 100644
--- a/paddle/operators/math/vol2col_test.cc
+++ b/paddle/operators/math/vol2col_test.cc
@@ -82,7 +82,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom(input_tmp, *place, *context);
+    CopyFrom(input_tmp, *place, *context, &input);
   }
   output.mutable_data<float>({1, filter_size, filter_size, filter_size,
                               output_depth, output_height, output_width},
@@ -96,7 +96,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_cfo_ptr = output.data<float>();
   } else {
-    output_tmp.CopyFrom(output, paddle::platform::CPUPlace(), *context);
+    CopyFrom(output, paddle::platform::CPUPlace(), *context, &output_tmp);
     out_cfo_ptr = output_tmp.data<float>();
   }
 
@@ -110,7 +110,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom(input_tmp, *place, *context);
+    CopyFrom(input_tmp, *place, *context, &input);
   }
 
   paddle::operators::math::Col2VolFunctor<Place, float> col2vol;
@@ -120,7 +120,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     in_ptr = input.data<float>();
   } else {
-    input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context);
+    CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp);
     in_ptr = input_tmp.data<float>();
   }
 
diff --git a/paddle/operators/max_sequence_len_op.cc b/paddle/operators/max_sequence_len_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..798022c9dd904a0ac189b4b550a94264a433ebf2
--- /dev/null
+++ b/paddle/operators/max_sequence_len_op.cc
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+class MaxSeqenceLenOp : public framework::OperatorBase {
+ public:
+  MaxSeqenceLenOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &rank_table =
+        scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    int64_t *out_ptr = out->mutable_data<int64_t>({1}, platform::CPUPlace());
+    *out_ptr = rank_table.items()[0].length;
+  }
+};
+
+class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MaxSeqenceLenOpProtoMaker(framework::OpProto *proto,
+                            framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("RankTable", "The lod_rank_table.");
+    AddOutput("Out", "The max sequence length.");
+    AddComment(
+        R"DOC(Calculate the max sequence length through lod_rank_table.)DOC");
+  }
+};
+
+class MaxSeqenceLenInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("RankTable"));
+    context->SetOutputDim("Out", {1});
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(max_sequence_len, paddle::operators::MaxSeqenceLenOp,
+                  paddle::operators::MaxSeqenceLenOpProtoMaker,
+                  paddle::operators::MaxSeqenceLenInferShape,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/maxout_op.cc b/paddle/operators/maxout_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e203a25d544372220e8246e5e17ffbc6408d2998
--- /dev/null
+++ b/paddle/operators/maxout_op.cc
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *     Unless required by applicable law or agreed to in writing, software
+ *     distributed under the License is distributed on an "AS IS" BASIS,
+ *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *     See the License for the specific language governing permissions and
+ *     limitations under the License. */
+
+#include "paddle/operators/maxout_op.h"
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MaxOutOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of maxout operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of maxout operator."
+              "The format of output tensor is also NCHW."
+              "Where N is batch size, C is "
+              "the number of channels, H and W is the height and "
+              "width of feature.");
+    AddAttr<int>(
+        "groups",
+        R"DOC("Specifies how many groups the input tensor will be split"
+        "in the channel dimension. And the number of output channel is "
+        "the number of channels divided by groups.."
+        )DOC");
+    AddComment(R"DOC(
+        Assumed the input shape is (N, Ci, H, W).
+        The output shape is (N, Co, H, W). Then `Co = Ci / groups`.
+
+       math:
+       y_{si+j} = \max_k x_{gsi + sk + j}
+       g = groups
+       s = input.size / num_channels
+       0 \le i < num_channels / groups
+       0 \le j < s
+       0 \le k < groups
+
+    Please refer to Paper:
+      - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
+      - Multi-digit Number Recognition from Street View \
+        Imagery using Deep Convolutional Neural Networks: \
+        https://arxiv.org/pdf/1312.6082v4.pdf
+        )DOC");
+  }
+};
+
+class MaxOutOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MaxoutOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MaxoutOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    int groups = ctx->Attrs().Get<int>("groups");
+    // check groups > 1
+    PADDLE_ENFORCE_GT(groups, 1, "groups should be larger than 1 in maxoutop");
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1] / groups});
+    output_shape.push_back(in_x_dims[2]);
+    output_shape.push_back(in_x_dims[3]);
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+
+class MaxOutOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Input(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
+            ops::MaxOutOpGrad);
+REGISTER_OP_CPU_KERNEL(maxout,
+                       ops::MaxOutKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    maxout_grad, ops::MaxOutGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/maxout_op.cu.cc b/paddle/operators/maxout_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..decd43913d69d122330886e07178778d03f7fef5
--- /dev/null
+++ b/paddle/operators/maxout_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/maxout_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(maxout,
+                       ops::MaxOutKernel<paddle::platform::GPUPlace, float>,
+                       ops::MaxOutKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    maxout_grad, ops::MaxOutGradKernel<paddle::platform::GPUPlace, float>,
+    ops::MaxOutGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/maxout_op.h b/paddle/operators/maxout_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..44a0d073dda642f6e261ce5760013f3e1055f43d
--- /dev/null
+++ b/paddle/operators/maxout_op.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/maxouting.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename Place, typename T>
+class MaxOutKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    Tensor* out = context.Output<Tensor>("Out");
+    int groups = context.template Attr<int>("groups");
+
+    math::MaxOutFunctor<Place, T> maxout_forward;
+    maxout_forward(context.device_context(), *in_x, out, groups);
+  }
+};
+
+template <typename Place, typename T>
+class MaxOutGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    const Tensor* out = context.Input<Tensor>("Out");
+    const Tensor* out_grad =
+        context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    int groups = context.template Attr<int>("groups");
+    auto& device_ctx = context.device_context();
+    math::SetConstant<Place, T> zero;
+    if (in_x_grad) {
+      in_x_grad->mutable_data<T>(context.GetPlace());
+      zero(device_ctx, in_x_grad, static_cast<T>(0.0));
+      math::MaxOutGradFunctor<Place, T> maxout_backward;
+      maxout_backward(context.device_context(), *in_x, in_x_grad, *out,
+                      *out_grad, groups);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc
index 80460c476921b63ec5228a9780880c7db3c85217..adc688dbd5e13a2203d6842a12acdb8625288275 100644
--- a/paddle/operators/merge_lod_tensor_op.cc
+++ b/paddle/operators/merge_lod_tensor_op.cc
@@ -45,7 +45,7 @@ class MergeLoDTensorOp : public framework::OperatorBase {
       cpu_mask->ShareDataWith(mask);
     } else if (platform::is_gpu_place(mask.place())) {
 #ifdef PADDLE_WITH_CUDA
-      cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx);
+      framework::CopyFrom(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
 #else
       PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
 #endif
@@ -99,8 +99,9 @@ class MergeLoDTensorOp : public framework::OperatorBase {
       if (len == 0) {
         continue;
       }
-      out->Slice(out_offset, out_offset + len)
-          .CopyFrom(input->Slice(start_offset, end_offset), place, dev_ctx);
+      auto slice = out->Slice(out_offset, out_offset + len);
+      framework::CopyFrom(input->Slice(start_offset, end_offset), place,
+                          dev_ctx, &slice);
       out_offset += len;
       (*in_idx) += 1;
     }
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
index 49ed8a8879527fd32dd8b001ea256e46a0353487..10dff8d021d0394702cc8b92e779c012a4cf3eb2 100644
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
@@ -33,7 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context());
+    CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
     auto* index = index_t_cpu.data<int32_t>();
     auto stream = ctx.cuda_device_context().stream();
     Place place = boost::get<Place>(ctx.GetPlace());
@@ -68,7 +68,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context());
+    CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
     auto* index = index_t_cpu.data<int32_t>();
 
     auto stream = ctx.cuda_device_context().stream();
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
index 66fcc09bc877867e66a37adc73230d8dabf4cbed..22a37ff1bbf6b8cfb2cbc3c3dbbb20a87c5ea4e7 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -49,7 +49,7 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Communicator",
               "Create Communicator for communicating between gpus");
     AddAttr<std::vector<int>>("gpus", "(vector<int>) GPU id lists");
-    AddAttr<int>("data_type",
+    AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
         .SetDefault(framework::DataType::FP32);
diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc
index 56ba57854955c08031214d1f751c17fbb8bb882c..bb7ae20286dd8e52f72b79cbf353bd812a2cc092 100644
--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
@@ -97,7 +97,7 @@ class NCCLTester : public ::testing::Test {
       send_tensor->mutable_data<T>(kDims, place);
 
       std::vector<T> send_vector(f::product(kDims), gpu_id);
-      send_tensor->CopyFromVector<T>(send_vector, *ctx);
+      paddle::framework::CopyFromVector<T>(send_vector, *ctx, send_tensor);
       ctx->Wait();
       VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
     }
diff --git a/paddle/operators/pool_cudnn_op.cc b/paddle/operators/pool_cudnn_op.cc
index f962d9e3e6abde14ce21eb0102f10d139fdb160e..be9fcc5661f420aadf908cf80cce6c963008b0e4 100644
--- a/paddle/operators/pool_cudnn_op.cc
+++ b/paddle/operators/pool_cudnn_op.cc
@@ -20,6 +20,18 @@ REGISTER_OP(pool2d_cudnn, ops::PoolOp, ops::Pool2dOpMaker, pool2d_cudnn_grad,
             ops::PoolOpGrad);
 
 REGISTER_OP_CPU_KERNEL(pool2d_cudnn,
-                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(pool2d_cudnn_grad,
-                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>)
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, double>)
+
+REGISTER_OP(pool3d_cudnn, ops::PoolOp, ops::Pool3dOpMaker, pool3d_cudnn_grad,
+            ops::PoolOpGrad);
+
+REGISTER_OP_CPU_KERNEL(pool3d_cudnn,
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(pool3d_cudnn_grad,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, double>)
diff --git a/paddle/operators/pool_cudnn_op.cu.cc b/paddle/operators/pool_cudnn_op.cu.cc
index 8711567b95fea355396173b5312d26d31f9ffb12..66dd194ccd5ed629c5861552a7c124dc911362d7 100644
--- a/paddle/operators/pool_cudnn_op.cu.cc
+++ b/paddle/operators/pool_cudnn_op.cu.cc
@@ -52,7 +52,13 @@ class PoolCudnnOpKernel : public framework::OpKernel<T> {
     ScopedTensorDescriptor input_desc;
     ScopedTensorDescriptor output_desc;
     ScopedPoolingDescriptor pool_desc;
-    DataLayout layout = DataLayout::kNCHW;
+    DataLayout layout;
+
+    if (strides.size() == 2U) {
+      layout = DataLayout::kNCHW;
+    } else {
+      layout = DataLayout::kNCDHW;
+    }
 
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()));
@@ -112,7 +118,13 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
     ScopedTensorDescriptor input_desc;
     ScopedTensorDescriptor output_desc;
     ScopedPoolingDescriptor pool_desc;
-    DataLayout layout = DataLayout::kNCHW;
+    DataLayout layout;
+
+    if (strides.size() == 2U) {
+      layout = DataLayout::kNCHW;
+    } else {
+      layout = DataLayout::kNCDHW;
+    }
 
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()));
@@ -135,8 +147,7 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
 
     if (input_grad) {
       T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      math::SetConstant<paddle::platform::GPUPlace, T> set_zero;
-      set_zero(ctx.device_context(), input_grad, static_cast<T>(0));
+      // Because beta is zero, it is unnecessary to reset input_grad.
 
       PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward(
           handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
@@ -151,5 +162,12 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel<float>);
-REGISTER_OP_GPU_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel<float>);
+REGISTER_OP_GPU_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel<float>,
+                       ops::PoolCudnnOpKernel<double>);
+REGISTER_OP_GPU_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel<float>,
+                       ops::PoolCudnnGradOpKernel<double>);
+
+REGISTER_OP_GPU_KERNEL(pool3d_cudnn, ops::PoolCudnnOpKernel<float>,
+                       ops::PoolCudnnOpKernel<double>);
+REGISTER_OP_GPU_KERNEL(pool3d_cudnn_grad, ops::PoolCudnnGradOpKernel<float>,
+                       ops::PoolCudnnGradOpKernel<double>);
diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc
index f3963b1995ef8767786f0bf230b134afc69aa99d..e26ffd86e5b5645e361070ca9fd9d8dc49d1ed30 100644
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -105,7 +105,7 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
   // TypedAttrChecker don't support vector type.)
   AddAttr<std::vector<int>>(
       "paddings",
-      "(vector<int>, defalut {0,0}), paddings(height, width) of pooling "
+      "(vector<int>, default {0,0}), paddings(height, width) of pooling "
       "operator."
       "If global_pooling = true, paddings and ksize will be ignored.")
       .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
@@ -122,15 +122,15 @@ Parameters(ksize, strides, paddings) are two elements.
 These two elements represent height and width, respectively.
 The input(X) size and output(Out) size may be different.
 
-Example:
+Example:   
   Input:
        X shape: $(N, C, H_{in}, W_{in})$
   Output:
        Out shape: $(N, C, H_{out}, W_{out})$
-  where 
+  Where
        $$ 
-       H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
-       W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
        $$
 
 )DOC");
@@ -177,7 +177,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
                                // TypedAttrChecker don't support vector type.)
   AddAttr<std::vector<int>>(
       "paddings",
-      "(vector<int>, defalut {0,0,0}), paddings(depth, height, "
+      "(vector<int>, default {0,0,0}), paddings(depth, height, "
       "width) of pooling operator. "
       "If global_pooling = true, ksize and paddings will be ignored.")
       .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
@@ -199,12 +199,12 @@ Example:
        X shape: $(N, C, D_{in}, H_{in}, W_{in})$
   Output:
        Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
-  where
-       $$
-       D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
-       H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\
-       W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1
-       $$
+  Where
+  $$
+       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
+  $$
 
 )DOC");
 }
@@ -217,14 +217,18 @@ REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad,
             ops::PoolOpGrad);
 
 REGISTER_OP_CPU_KERNEL(pool2d,
-                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(pool2d_grad,
-                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>)
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, double>)
 
 REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad,
             ops::PoolOpGrad);
 
 REGISTER_OP_CPU_KERNEL(pool3d,
-                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(pool3d_grad,
-                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>);
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/pool_op.cu.cc b/paddle/operators/pool_op.cu.cc
index 0e3b80868f7b9d1697d619889160856d65ad59a3..1010cb762289dd39cd632c699f7528f4ba638278 100644
--- a/paddle/operators/pool_op.cu.cc
+++ b/paddle/operators/pool_op.cu.cc
@@ -17,11 +17,15 @@ limitations under the License. */
 namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(pool2d,
-                       ops::PoolKernel<paddle::platform::GPUPlace, float>);
+                       ops::PoolKernel<paddle::platform::GPUPlace, float>,
+                       ops::PoolKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(pool2d_grad,
-                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>);
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>,
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, double>);
 
 REGISTER_OP_GPU_KERNEL(pool3d,
-                       ops::PoolKernel<paddle::platform::GPUPlace, float>);
+                       ops::PoolKernel<paddle::platform::GPUPlace, float>,
+                       ops::PoolKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(pool3d_grad,
-                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>);
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>,
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc
index 1df36e965abab3549aeb88bf682b712033c4d79c..b9c42a69128a26ff5942748e11fb87c57d3e3f58 100644
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -29,11 +29,11 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "X(Input) of Pooling should not be null.");
+                   "Input(X) of Pooling should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Out(Output) of Pooling should not be null.");
+                   "Output(Out) of Pooling should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Mask"),
-                   "Mask(Output) of Pooling should not be null.");
+                   "Output(Mask) of Pooling should not be null.");
 
     auto in_x_dims = ctx->GetInputDim("X");
 
@@ -67,6 +67,14 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
     ctx->SetOutputDim("Mask", framework::make_ddim(output_shape));
   }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
 };
 
 class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
@@ -80,6 +88,14 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
                    "Input(X@GRAD) should not be null.");
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
 };
 
 class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -116,7 +132,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
     // TypedAttrChecker don't support vector type.)
     AddAttr<bool>(
         "global_pooling",
-        "(bool, default false) Whether to use the global pooling. "
+        "(bool, default:false) Whether to use the global pooling. "
         "If global_pooling = true, ksize and paddings will be ignored.")
         .SetDefault(false);
     AddAttr<std::vector<int>>("strides",
@@ -126,7 +142,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
     // TypedAttrChecker don't support vector type.)
     AddAttr<std::vector<int>>(
         "paddings",
-        "(vector<int>, defalut {0, 0}), paddings(height, width) of pooling "
+        "(vector<int>, default:{0, 0}), paddings(height, width) of pooling "
         "operator. "
         "If global_pooling = true, paddings and will be ignored.")
         .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
@@ -150,10 +166,10 @@ Example:
   Output:
        Out shape: $(N, C, H_{out}, W_{out})$
        Mask shape: $(N, C, H_{out}, W_{out})$
-  where
+  Where
        $$
-       H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
-       W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
        $$
 
 )DOC");
@@ -204,7 +220,7 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
     // TypedAttrChecker don't support vector type.)
     AddAttr<std::vector<int>>(
         "paddings",
-        "(vector, defalut {0,0,0}), paddings(depth, "
+        "(vector, default {0,0,0}), paddings(depth, "
         "height, width) of pooling operator. "
         "If global_pooling = true, paddings and ksize will be ignored.")
         .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
@@ -228,11 +244,11 @@ Example:
   Output:
        Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
        Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$
-  where
+  Where
        $$
-       D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
-       H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\
-       W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
        $$
 
 )DOC");
@@ -250,10 +266,12 @@ REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
 
 REGISTER_OP_CPU_KERNEL(
     max_pool2d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float>);
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, double, int>);
 REGISTER_OP_CPU_KERNEL(
     max_pool2d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float, int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, double, int>)
 
 REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
             ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad,
@@ -261,7 +279,9 @@ REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
 
 REGISTER_OP_CPU_KERNEL(
     max_pool3d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float>);
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, double, int>);
 REGISTER_OP_CPU_KERNEL(
     max_pool3d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float, int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, double, int>)
diff --git a/paddle/operators/pool_with_index_op.cu.cc b/paddle/operators/pool_with_index_op.cu.cc
index 287657d4b1c57f354ef050885f71261092bdc062..335064a7eea4ec15c529db5254cbb026ba575f3d 100644
--- a/paddle/operators/pool_with_index_op.cu.cc
+++ b/paddle/operators/pool_with_index_op.cu.cc
@@ -18,14 +18,18 @@ namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(
     max_pool2d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float>);
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, double, int>);
 REGISTER_OP_GPU_KERNEL(
     max_pool2d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float, int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, double, int>)
 
 REGISTER_OP_GPU_KERNEL(
     max_pool3d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float>);
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, double, int>);
 REGISTER_OP_GPU_KERNEL(
     max_pool3d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float, int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, double, int>)
diff --git a/paddle/operators/pool_with_index_op.h b/paddle/operators/pool_with_index_op.h
index a081607edce335f0265388ab01238d584bcf3ead..40766c7e821e8b85aeda9473798a1f696d0ad719 100644
--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
@@ -24,8 +24,8 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T>
-class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
+template <typename Place, typename T1, typename T2>
+class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* in_x = context.Input<Tensor>("X");
@@ -44,13 +44,13 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
 
     switch (ksize.size()) {
       case 2: {
-        paddle::operators::math::MaxPool2dWithIndexFunctor<Place, T>
+        paddle::operators::math::MaxPool2dWithIndexFunctor<Place, T1, T2>
             pool2d_forward;
         pool2d_forward(context.device_context(), *in_x, ksize, strides,
                        paddings, out, mask);
       } break;
       case 3: {
-        paddle::operators::math::MaxPool3dWithIndexFunctor<Place, T>
+        paddle::operators::math::MaxPool3dWithIndexFunctor<Place, T1, T2>
             pool3d_forward;
         pool3d_forward(context.device_context(), *in_x, ksize, strides,
                        paddings, out, mask);
@@ -60,8 +60,8 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
-class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
+template <typename Place, typename T1, typename T2>
+class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* mask = context.Input<Tensor>("Mask");
@@ -80,19 +80,19 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
     }
 
     if (in_x_grad) {
-      in_x_grad->mutable_data<T>(context.GetPlace());
+      in_x_grad->mutable_data<T1>(context.GetPlace());
       auto& device_ctx = context.device_context();
       math::set_constant(device_ctx, in_x_grad, 0);
 
       switch (ksize.size()) {
         case 2: {
-          paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T>
+          paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T1, T2>
               pool2d_backward;
           pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides,
                           paddings, in_x_grad);
         } break;
         case 3: {
-          paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T>
+          paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T1, T2>
               pool3d_backward;
           pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
                           paddings, in_x_grad);
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 0075ccd24271bf83f139e121efad00c2316cc11b..c976e22c7740ad11279ab5ee75e4d130be8fa0c5 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -284,7 +284,8 @@ class RecurrentOp : public RecurrentBase {
             auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
             // Explicit copy output since the local RNN scope can be destroyed
             // early.
-            dst_out.CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx);
+            framework::CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx,
+                                &dst_out);
           });
 
       scopes.Next();
@@ -365,7 +366,8 @@ class RecurrentGradOp : public RecurrentBase {
           auto *cur_grad_var = cur_scope.Var(cur_grad);
           auto cur_grad_tensor =
               cur_grad_var->GetMutable<framework::LoDTensor>();
-          cur_grad_tensor->CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx);
+          framework::CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx,
+                              cur_grad_tensor);
         }
       }
 
@@ -401,7 +403,7 @@ class RecurrentGradOp : public RecurrentBase {
             auto &inside_tensor = cur_scope.FindVar(inside_grad_name)
                                       ->Get<framework::LoDTensor>();
             framework::AttributeMap attrs;
-            attrs["data_type"] = framework::ToDataType(inside_tensor.type());
+            attrs["dtype"] = framework::ToDataType(inside_tensor.type());
             attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
             attrs["value"] = 0.0f;
 
@@ -438,7 +440,7 @@ class RecurrentGradOp : public RecurrentBase {
             }
 
             auto dst = outside->Slice(seq_offset, seq_offset + 1);
-            dst.CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx);
+            framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, &dst);
           });
       VLOG(5) << "Link outside gradient finished ";
 
@@ -451,7 +453,7 @@ class RecurrentGradOp : public RecurrentBase {
                 framework::LoDTensor *outside) {
               outside->Resize(inside.dims());
               outside->mutable_data(dev_ctx.GetPlace(), inside.type());
-              outside->CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx);
+              framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, outside);
             });
         VLOG(5) << "Link initialize state gradient finished ";
       }
diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c69e416e10f2a9ced1f1b22c39235e4c9338e77c
--- /dev/null
+++ b/paddle/operators/recv_op.cc
@@ -0,0 +1,121 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <stdint.h>
+#include <sys/stat.h>
+#include <ostream>
+#include <thread>
+
+#include <unistd.h>
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/executor.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/send_recv_impl.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+namespace paddle {
+namespace operators {
+
+void RunServer(Server **rpc_server,
+               std::shared_ptr<detail::SendRecvServerImpl> service,
+               const std::string &server_address) {
+  ServerBuilder builder;
+  builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
+  builder.RegisterService(service.get());
+  std::unique_ptr<Server> server(builder.BuildAndStart());
+  *rpc_server = server.get();
+  LOG(INFO) << "Server listening on " << server_address << std::endl;
+  server->Wait();
+}
+
+class RecvOp : public framework::OperatorBase {
+ public:
+  RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {
+    if (!rpc_service_) {
+      rpc_service_.reset(new detail::SendRecvServerImpl());
+      std::string endpoint = Attr<std::string>("endpoint");
+      server_thread_.reset(
+          new std::thread(RunServer, &rpc_server_, rpc_service_, endpoint));
+    }
+  }
+
+  virtual ~RecvOp() {
+    rpc_server_->Shutdown();
+    server_thread_->join();
+  }
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    // blocking get one var from client.
+    const framework::LoDTensor &t = rpc_service_->Get();
+    framework::Scope &recv_scope = scope.NewScope();
+    // set graph input var
+    auto *var = recv_scope.Var(Input("RX"));
+    auto *tensor = var->GetMutable<framework::LoDTensor>();
+    // FIXME(typhoonzero): do not copy
+    framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor);
+
+    auto *block = Attr<framework::BlockDescBind *>("OptimizeBlock");
+    auto *program = block->Program();
+    framework::Executor executor(dev_ctx);
+    // Run sub graph to get optimized tensor
+    executor.Run(*program, &recv_scope, block->ID(),
+                 false /*create_local_scope*/);
+
+    auto *out_var = recv_scope.FindVar("Out");
+    // push back
+    rpc_service_->Push(out_var->Get<framework::LoDTensor>());
+  }
+
+ protected:
+  // grpc server instance to track status and gracefully shutdown.
+  // borrow an pointer from server thread.
+  Server *rpc_server_{nullptr};
+  // grpc send/recv service implement to register.
+  std::shared_ptr<detail::SendRecvServerImpl> rpc_service_;
+  std::shared_ptr<std::thread> server_thread_;
+};
+
+class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RecvOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("RX", "(Tensor) Input tensor to be saved");
+    AddComment(R"DOC(
+Recv operator
+
+This operator will recv tensor from send_op
+)DOC");
+    AddAttr<std::string>("endpoint",
+                         "(string, default 127.0.0.1:6164)"
+                         "IP address to listen on.")
+        .SetDefault("127.0.0.1:6164")
+        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
+    AddAttr<framework::BlockDescBind *>("OptimizeBlock", "type BlockDescBind*",
+                                        "optimize network run in server");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker);
diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h
index beb951713ae2a9fd83fe7c1a5e97ee8c642158a8..0e98c8b4f443f88ecba044f2f79228227695e182 100644
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
@@ -28,7 +28,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
     auto* in = ctx.Input<framework::Tensor>("X");
     auto out_dims = out->dims();
     out->mutable_data<T>(ctx.GetPlace());
-    out->CopyFrom(*in, ctx.GetPlace(), ctx.device_context());
+    framework::CopyFrom(*in, ctx.GetPlace(), ctx.device_context(), out);
     out->Resize(out_dims);
   }
 };
@@ -42,7 +42,7 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
     d_x->mutable_data<T>(ctx.GetPlace());
 
     auto in_dims = d_x->dims();
-    d_x->CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context());
+    framework::CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
     d_x->Resize(in_dims);
   }
 };
diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc
deleted file mode 100644
index ee61ea300c33722471189d06eb09f67a083d2a4d..0000000000000000000000000000000000000000
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ /dev/null
@@ -1,134 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/operators/rnn/recurrent_op_utils.h"
-
-namespace paddle {
-namespace operators {
-namespace rnn {
-
-namespace f = paddle::framework;
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-void SegmentInputs(const std::vector<Scope*>& step_scopes,
-                   const std::vector<std::string>& inlinks,
-                   const size_t seq_len) {
-  PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided.");
-  for (size_t i = 0; i < inlinks.size(); ++i) {
-    // global inputs
-    auto input_var = step_scopes[0]->parent().FindVar(inlinks[i]);
-    PADDLE_ENFORCE_NOT_NULL(input_var, "input link [%s] is not in scope.",
-                            inlinks[i]);
-
-    LoDTensor* input = input_var->GetMutable<LoDTensor>();
-    f::DDim dims = input->dims();
-    PADDLE_ENFORCE_EQ(static_cast<size_t>(dims[0]), seq_len,
-                      "all the inputs be the same length");
-    f::DDim step_dims = slice_ddim(dims, 1, dims.size());
-    for (size_t j = 0; j < seq_len; j++) {
-      Tensor* step_input =
-          step_scopes[j]->Var(inlinks[i])->GetMutable<Tensor>();
-      // The input of operators of each step is Tensor here.
-      // Maybe need to modify Slice function.
-      *step_input = input->Slice(j, j + 1);
-      step_input->Resize(step_dims);
-    }
-  }
-}
-
-void ConcatOutputs(const std::vector<Scope*>& step_scopes,
-                   const std::vector<std::string>& outlinks,
-                   const size_t seq_len, const platform::DeviceContext& ctx) {
-  for (size_t i = 0; i < outlinks.size(); i++) {
-    auto* output_var = step_scopes[0]->parent().FindVar(outlinks[i]);
-    PADDLE_ENFORCE_NOT_NULL(output_var, "output link [%s] is not in scope.",
-                            outlinks[i]);
-    LoDTensor* output = output_var->GetMutable<LoDTensor>();
-
-    auto* step_scope_var = step_scopes[0]->FindVar(outlinks[i]);
-    PADDLE_ENFORCE_NOT_NULL(step_scope_var, "%s not in scope", outlinks[i]);
-    f::DDim step_dims =
-        step_scope_var->template GetMutable<LoDTensor>()->dims();
-    std::vector<int64_t> dims_vec = vectorize(step_dims);
-    dims_vec.insert(dims_vec.begin(), seq_len);
-    output->Resize(f::make_ddim(dims_vec));
-    output->mutable_data<float>(platform::CPUPlace());
-    for (size_t j = 0; j < seq_len; j++) {
-      LoDTensor* step_output =
-          step_scopes[j]->FindVar(outlinks[i])->GetMutable<LoDTensor>();
-      // TODO(luotao02) data type and platform::DeviceContext() should set
-      // correctly
-      (output->Slice(j, j + 1))
-          .CopyFrom(*step_output, platform::CPUPlace(), ctx);
-    }
-  }
-}
-
-void LinkMemories(const std::vector<Scope*>& scopes,
-                  const std::vector<rnn::StateAttr>& memories,
-                  const size_t step_id, const int offset) {
-  PADDLE_ENFORCE_LT(step_id, scopes.size(),
-                    "step [%d] is out of range of step scopes' size [%d]",
-                    step_id, scopes.size());
-  PADDLE_ENFORCE_GE(static_cast<int>(step_id) + offset, 0,
-                    "offset [%d] must be large than -[%d]", offset, step_id);
-  PADDLE_ENFORCE_LT(
-      step_id + offset, scopes.size(),
-      "offset [%d] is out of range, it must be less than (%d - %d)", offset,
-      scopes.size(), step_id);
-  auto* scope = scopes[step_id];
-  auto* linked_scope = scopes[step_id + offset];
-  for (auto& attr : memories) {
-    auto* mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>();
-    auto* linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>();
-    mem->Resize(linked_mem->dims());
-    mem->ShareDataWith(*linked_mem);
-  }
-}
-
-void InitArgument(const ArgumentName& name, Argument* arg,
-                  const framework::OperatorBase& op, bool is_grad) {
-  arg->step_scopes =
-      is_grad ? op.Input(name.step_scopes) : op.Output(name.step_scopes);
-  arg->inlinks = op.Inputs(name.inlinks);
-  arg->outlinks = op.Outputs(name.outlinks);
-
-  auto& boot_memories = is_grad ? op.Outputs(name.initial_states)
-                                : op.Inputs(name.initial_states);
-  // attributes
-  auto& memories = op.Attr<std::vector<std::string>>(name.states);
-  auto& pre_memories = op.Attr<std::vector<std::string>>(name.ex_states);
-
-  PADDLE_ENFORCE(memories.size() == boot_memories.size(),
-                 "the size of states, initial_states don't match:%d,%d",
-                 memories.size(), boot_memories.size());
-  PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(),
-                 "the size of ex_states, initial_states don't match:%d,%d",
-                 pre_memories.size(), boot_memories.size());
-  PADDLE_ENFORCE(memories.size() > 0, "more than 1 states should be set");
-
-  for (size_t i = 0; i < memories.size(); ++i) {
-    rnn::StateAttr mem_attr;
-    mem_attr.var = memories[i];
-    mem_attr.pre_var = pre_memories[i];
-    mem_attr.boot_var = boot_memories[i];
-    (arg->states).push_back(mem_attr);
-  }
-}
-
-}  // namespace rnn
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h
deleted file mode 100644
index fb0e158e07745d58c6211d33e385b324e492b95e..0000000000000000000000000000000000000000
--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-
-#include <string>
-
-#include "paddle/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-namespace rnn {
-
-using Scope = framework::Scope;
-
-/**
- * Memory of a RNN (same as the role of `Momory` in PaddlePaddle).
- *
- * Memory attributes cached by this op, dims will be infered from
- * boot memories in father scope. Other attributes are copied from Op's proto
- * attributes.
- */
-struct StateAttr {
-  // name of current state variable
-  std::string var;
-  // name of previous step's state variable
-  std::string pre_var;
-  // name of the variables to init this memory (same role of `boot_layer` in
-  // PaddlePaddle), which is store in father's scope.
-  std::string boot_var;
-};
-
-struct Argument {
-  std::string step_net;
-  std::string step_scopes;
-  std::vector<std::string> inlinks;
-  std::vector<std::string> outlinks;
-  std::vector<rnn::StateAttr> states;
-};
-
-struct ArgumentName {
-  std::string step_net;
-  std::string step_scopes;
-  std::string inlinks;
-  std::string outlinks;
-  std::string states;          // the memory name
-  std::string ex_states;       // the previous memory name
-  std::string initial_states;  // the boot memory name
-};
-
-/**
- * Prepare inputs for each step net.
- */
-void SegmentInputs(const std::vector<Scope*>& step_scopes,
-                   const std::vector<std::string>& inlinks,
-                   const size_t seq_len);
-
-/**
- * Process outputs of step nets and merge to variables.
- */
-void ConcatOutputs(const std::vector<Scope*>& step_scopes,
-                   const std::vector<std::string>& outlinks,
-                   const size_t seq_len, const platform::DeviceContext& ctx);
-
-void LinkMemories(const std::vector<Scope*>& step_scopes,
-                  const std::vector<StateAttr>& memories, const size_t step_id,
-                  const int offset);
-
-void InitArgument(const ArgumentName& name, Argument* arg,
-                  const framework::OperatorBase& op, bool is_grad = false);
-
-}  // namespace rnn
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/rnn_memory_helper_op.cc b/paddle/operators/rnn_memory_helper_op.cc
index b621c7f1ba3f9e9613dea5bc98ef74c7c6dae9a0..3a035f0b9acb94bab60659938e11b4996b8eaa0f 100644
--- a/paddle/operators/rnn_memory_helper_op.cc
+++ b/paddle/operators/rnn_memory_helper_op.cc
@@ -62,7 +62,7 @@ class RNNMemoryHelperOpInfoMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "");
     AddOutput("Out", "");
-    AddAttr<int>("data_type",
+    AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
         .SetDefault(framework::DataType::FP32);
@@ -95,7 +95,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
       auto &in_var_tensor = in_var->Get<framework::LoDTensor>();
 
       framework::AttributeMap attrs;
-      attrs["data_type"] = framework::ToDataType(in_var_tensor.type());
+      attrs["dtype"] = framework::ToDataType(in_var_tensor.type());
       attrs["shape"] = framework::vectorize2int(in_var_tensor.dims());
       attrs["value"] = 0.0f;
 
@@ -121,7 +121,7 @@ class RNNMemoryHelperGradOpInfoMaker
     AddInput("X", "");
     AddInput("Out", "");
     AddOutput(framework::GradVarName("X"), "");
-    AddAttr<int>("data_type",
+    AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
         .SetDefault(framework::DataType::FP32);
diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b5e66c96b726a3c1fdb2596a244c5395db85279
--- /dev/null
+++ b/paddle/operators/roi_pool_op.cc
@@ -0,0 +1,165 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/roi_pool_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+static constexpr int kROISize = 5;
+
+class ROIPoolOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ROIs"),
+                   "Input(ROIs) of ROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Argmax"),
+                   "Output(Argmax) of ROIPoolOp should not be null.");
+    auto input_dims = ctx->GetInputDim("X");
+    auto rois_dims = ctx->GetInputDim("ROIs");
+
+    PADDLE_ENFORCE(input_dims.size() == 4,
+                   "The format of input tensor is NCHW.");
+    PADDLE_ENFORCE(rois_dims.size() == 2,
+                   "ROIs should be a 2-D tensor of shape (num_rois, 5)"
+                   "given as [[batch_id, x1, y1, x2, y2], …].");
+    PADDLE_ENFORCE(rois_dims[1] == kROISize,
+                   "ROIs should be a 2-D tensor of shape (num_rois, 5)"
+                   "given as [[batch_id, x1, y1, x2, y2], …].");
+
+    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
+    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
+    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
+
+    PADDLE_ENFORCE_GT(pooled_height, 0,
+                      "The pooled output height must greater than 0");
+    PADDLE_ENFORCE_GT(pooled_width, 0,
+                      "The pooled output width must greater than 0");
+    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
+                      "The spatial scale must greater than 0");
+
+    auto out_dims = input_dims;
+    out_dims[0] = rois_dims[0];
+    out_dims[1] = input_dims[1];
+    out_dims[2] = pooled_height;
+    out_dims[3] = pooled_width;
+
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->SetOutputDim("Argmax", out_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class ROIPoolGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "The gradient of Out should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
+                   "The gradient of X should not be null.");
+    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ROIPoolOpMaker(framework::OpProto* proto,
+                 framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor), "
+             "the input of ROIPoolOp. "
+             "The format of input tensor is NCHW. Where N is batch size, "
+             "C is the number of input channels, "
+             "H is the height of the feature, and "
+             "W is the width of the feature.");
+    AddInput("ROIs",
+             "(Tensor), "
+             "ROIs (Regions of Interest) to pool over. "
+             "should be a 2-D tensor of shape (num_rois, 5)"
+             "given as [[batch_id, x1, y1, x2, y2], …]. "
+             "Where batch_id is the id of the data, "
+             "(x1, y1) is the top left coordinates, and "
+             "(x2, y2) is the bottom right coordinates.");
+    AddOutput("Out",
+              "(Tensor), "
+              "The output of ROIPoolOp is a 4-D tensor with shape "
+              "(num_rois, channels, pooled_h, pooled_w).");
+    AddOutput("Argmax",
+              "(Tensor), "
+              "Argmaxes corresponding to indices in X used "
+              "for gradient computation. Only output "
+              "if arg “is_test” is false.")
+        .AsIntermediate();
+    AddAttr<float>("spatial_scale",
+                   "(float, default 1.0), "
+                   "Multiplicative spatial scale factor "
+                   "to translate ROI coords from their input scale "
+                   "to the scale used when pooling.")
+        .SetDefault(1.0);
+    AddAttr<int>("pooled_height",
+                 "(int, default 1), "
+                 "The pooled output height.")
+        .SetDefault(1);
+    AddAttr<int>("pooled_width",
+                 "(int, default 1), "
+                 "The pooled output width.")
+        .SetDefault(1);
+    AddComment(R"DOC(
+ROIPool operator
+
+ROI Pooling for Faster-RCNN. The link below is a further introduction: 
+https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad,
+            ops::ROIPoolGradOp);
+REGISTER_OP_CPU_KERNEL(
+    roi_pool, ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, float>,
+    ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    roi_pool_grad,
+    ops::CPUROIPoolGradOpKernel<paddle::platform::CPUPlace, float>,
+    ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/roi_pool_op.cu b/paddle/operators/roi_pool_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9a4c8ca752bb7abc4f44d4815743769bc989703a
--- /dev/null
+++ b/paddle/operators/roi_pool_op.cu
@@ -0,0 +1,208 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/roi_pool_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaxinumNumBlocks = 4096;
+static constexpr int kROISize = 5;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaxinumNumBlocks);
+}
+
+template <typename T>
+__global__ void GPUROIPoolForward(const int nthreads, const T* input_data,
+                                  const int64_t* input_rois,
+                                  const float spatial_scale, const int channels,
+                                  const int height, const int width,
+                                  const int pooled_height,
+                                  const int pooled_width, T* output_data,
+                                  int64_t* argmax_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (size_t i = index; i < nthreads; i += offset) {
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const int64_t* offset_input_rois = input_rois + n * kROISize;
+    int roi_batch_ind = offset_input_rois[0];
+    int roi_start_w = round(offset_input_rois[1] * spatial_scale);
+    int roi_start_h = round(offset_input_rois[2] * spatial_scale);
+    int roi_end_w = round(offset_input_rois[3] * spatial_scale);
+    int roi_end_h = round(offset_input_rois[4] * spatial_scale);
+
+    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
+    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
+    int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
+    int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
+    int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));
+
+    hstart = min(max(hstart + roi_start_h, 0), height);
+    hend = min(max(hend + roi_start_h, 0), height);
+    wstart = min(max(wstart + roi_start_w, 0), width);
+    wend = min(max(wend + roi_start_w, 0), width);
+    bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+    T maxval = is_empty ? 0 : -std::numeric_limits<T>::max();
+    int maxidx = -1;
+    const T* offset_input_data =
+        input_data + (roi_batch_ind * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        int input_data_index = h * width + w;
+        if (offset_input_data[input_data_index] > maxval) {
+          maxval = offset_input_data[input_data_index];
+          maxidx = input_data_index;
+        }
+      }
+    }
+    output_data[index] = maxval;
+    if (argmax_data) {
+      argmax_data[index] = maxidx;
+    }
+  }
+}
+
+template <typename T>
+__global__ void GPUROIPoolBackward(
+    const int nthreads, const int64_t* input_rois, const T* output_grad,
+    const int64_t* argmax_data, const int num_rois, const float spatial_scale,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, T* input_grad) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const int64_t* offset_input_rois = input_rois + n * kROISize;
+    int roi_batch_ind = offset_input_rois[0];
+    int input_offset = (roi_batch_ind * channels + c) * height * width;
+    int output_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_output_grad = output_grad + output_offset;
+    T* offset_input_grad = input_grad + input_offset;
+    const int64_t* offset_argmax_data = argmax_data + output_offset;
+
+    int argmax = offset_argmax_data[ph * pooled_width + pw];
+    if (argmax != -1) {
+      platform::CudaAtomicAdd(
+          offset_input_grad + argmax,
+          static_cast<T>(offset_output_grad[ph * pooled_width + pw]));
+    }
+  }
+}
+
+template <typename Place, typename T>
+class GPUROIPoolOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<Tensor>("X");
+    auto* rois = ctx.Input<Tensor>("ROIs");
+    auto* out = ctx.Output<Tensor>("Out");
+    auto* argmax = ctx.Output<Tensor>("Argmax");
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    auto in_dims = in->dims();
+    auto in_stride = framework::stride(in_dims);
+    int channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+
+    size_t rois_num = rois->dims()[0];
+    if (rois_num == 0) return;
+
+    int output_size = out->numel();
+    int blocks = NumBlocks(output_size);
+    int threads = kNumCUDAThreads;
+
+    GPUROIPoolForward<
+        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+        output_size, in->data<T>(), rois->data<int64_t>(), spatial_scale,
+        channels, height, width, pooled_height, pooled_width,
+        out->mutable_data<T>(ctx.GetPlace()),
+        argmax->mutable_data<int64_t>(ctx.GetPlace()));
+  }
+};
+
+template <typename Place, typename T>
+class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<Tensor>("X");
+    auto* rois = ctx.Input<Tensor>("ROIs");
+    auto* argmax = ctx.Input<Tensor>("Argmax");
+
+    auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    size_t rois_num = rois->dims()[0];
+    int channels = in->dims()[1];
+    int height = in->dims()[2];
+    int width = in->dims()[3];
+
+    if (x_grad) {
+      x_grad->mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<Place, T> set_zero;
+      set_zero(ctx.device_context(), x_grad, static_cast<T>(0));
+
+      int output_grad_size = out_grad->numel();
+      int blocks = NumBlocks(output_grad_size);
+      int threads = kNumCUDAThreads;
+
+      if (output_grad_size > 0) {
+        GPUROIPoolBackward<
+            T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+            output_grad_size, rois->data<int64_t>(), out_grad->data<T>(),
+            argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
+            width, pooled_height, pooled_width,
+            x_grad->mutable_data<T>(ctx.GetPlace()));
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    roi_pool, ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, float>,
+    ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    roi_pool_grad,
+    ops::GPUROIPoolGradOpKernel<paddle::platform::GPUPlace, float>,
+    ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..3812c66c65457b9d1337690d1a82759aab9a9732
--- /dev/null
+++ b/paddle/operators/roi_pool_op.h
@@ -0,0 +1,183 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class CPUROIPoolOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::Tensor>("ROIs");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    auto* argmax = ctx.Output<framework::Tensor>("Argmax");
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    auto in_dims = in->dims();
+    int batch_size = in_dims[0];
+    int channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+    int rois_num = rois->dims()[0];
+
+    auto in_stride = framework::stride(in_dims);
+    auto argmax_stride = framework::stride(argmax->dims());
+    auto roi_stride = framework::stride(rois->dims());
+    auto out_stride = framework::stride(out->dims());
+
+    const T* input_data = in->data<T>();
+    const int64_t* rois_data = rois->data<int64_t>();
+    T* output_data = out->mutable_data<T>(ctx.GetPlace());
+    int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());
+
+    for (int n = 0; n < rois_num; ++n) {
+      int roi_batch_id = rois_data[0];
+      PADDLE_ENFORCE_GE(roi_batch_id, 0);
+      PADDLE_ENFORCE_LT(roi_batch_id, batch_size);
+      rois_data += roi_stride[0];
+    }
+
+    rois_data = rois->data<int64_t>();
+    for (int n = 0; n < rois_num; ++n) {
+      int roi_batch_id = rois_data[0];
+      int roi_start_w = round(rois_data[1] * spatial_scale);
+      int roi_start_h = round(rois_data[2] * spatial_scale);
+      int roi_end_w = round(rois_data[3] * spatial_scale);
+      int roi_end_h = round(rois_data[4] * spatial_scale);
+
+      // Force malformed ROIs to be 1x1
+      int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);
+      int roi_width = std::max(roi_end_w - roi_start_w + 1, 1);
+
+      const float bin_size_h =
+          static_cast<float>(roi_height) / static_cast<float>(pooled_height);
+      const float bin_size_w =
+          static_cast<float>(roi_width) / static_cast<float>(pooled_width);
+
+      const T* batch_data = input_data + roi_batch_id * in_stride[0];
+
+      for (int c = 0; c < channels; ++c) {
+        for (int ph = 0; ph < pooled_height; ++ph) {
+          for (int pw = 0; pw < pooled_width; ++pw) {
+            //  Compute pooling region for this output unit:
+            //  start (included) = floor(ph * roi_height / pooled_height_)
+            //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
+            int hstart =
+                static_cast<int>(floor(static_cast<float>(ph) * bin_size_h));
+            int wstart =
+                static_cast<int>(floor(static_cast<float>(pw) * bin_size_w));
+            int hend =
+                static_cast<int>(ceil(static_cast<float>(ph + 1) * bin_size_h));
+            int wend =
+                static_cast<int>(ceil(static_cast<float>(pw + 1) * bin_size_w));
+
+            hstart = std::min(std::max(hstart + roi_start_h, 0), height);
+            hend = std::min(std::max(hend + roi_start_h, 0), height);
+            wstart = std::min(std::max(wstart + roi_start_w, 0), width);
+            wend = std::min(std::max(wend + roi_start_w, 0), width);
+
+            const int pool_index = ph * pooled_width + pw;
+
+            // Define an empty pooling region to be zero
+            bool is_empty = (hend <= hstart) || (wend <= wstart);
+            output_data[pool_index] =
+                is_empty ? 0 : -std::numeric_limits<T>::max();
+            argmax_data[pool_index] = -1;
+
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                const int index = h * width + w;
+                if (batch_data[index] > output_data[pool_index]) {
+                  output_data[pool_index] = batch_data[index];
+                  argmax_data[pool_index] = index;
+                }
+              }
+            }
+          }
+        }
+
+        batch_data += in_stride[1];
+        output_data += out_stride[1];
+        argmax_data += argmax_stride[1];
+      }
+      // Increment ROI data pointer
+      rois_data += roi_stride[0];
+    }
+    return;
+  }
+};
+
+template <typename Place, typename T>
+class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::Tensor>("ROIs");
+    auto* argmax = ctx.Input<framework::Tensor>("Argmax");
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+
+    if (in_grad) {
+      const int64_t* rois_data = rois->data<int64_t>();
+      const T* out_grad_data = out_grad->data<T>();
+      const int64_t* argmax_data = argmax->data<int64_t>();
+      T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<Place, T> set_zero;
+      set_zero(ctx.device_context(), in_grad, static_cast<T>(0));
+
+      auto in_stride = framework::stride(in->dims());
+      auto argmax_stride = framework::stride(argmax->dims());
+      auto roi_stride = framework::stride(rois->dims());
+      auto out_stride = framework::stride(out_grad->dims());
+
+      int rois_num = rois->dims()[0];
+      int channels = in->dims()[1];
+
+      for (int n = 0; n < rois_num; ++n) {
+        int roi_batch_idx = rois_data[0];
+        T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0];
+        for (int c = 0; c < channels; ++c) {
+          for (int ph = 0; ph < pooled_height; ++ph) {
+            for (int pw = 0; pw < pooled_width; ++pw) {
+              int pool_index = ph * pooled_width + pw;
+              if (argmax_data[pool_index] >= 0) {
+                auto index = argmax_data[pool_index];
+                batch_grad_data[index] += out_grad_data[pool_index];
+              }
+            }
+          }
+          batch_grad_data += in_stride[1];
+          out_grad_data += out_stride[1];
+          argmax_data += argmax_stride[1];
+        }
+        rois_data += roi_stride[0];
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc
index 56909fb65f44ad00314103e21bee9535fbd59317..d4921cb80c8d78c52ae1887c36819b52621470eb 100644
--- a/paddle/operators/save_op.cc
+++ b/paddle/operators/save_op.cc
@@ -88,73 +88,7 @@ class SaveOp : public framework::OperatorBase {
                    "SaveOp only support LoDTensor, %s has wrong type", iname);
 
     auto &tensor = var->Get<framework::LoDTensor>();
-
-    {  // the 1st field, uint32_t version
-      constexpr uint32_t version = 0;
-      fout.write(reinterpret_cast<const char *>(&version), sizeof(version));
-    }
-    {  // the 2nd field, tensor description
-       // int32_t  size
-       // void*    protobuf message
-      framework::TensorDesc desc;
-      desc.set_data_type(framework::ToDataType(tensor.type()));
-      auto dims = framework::vectorize(tensor.dims());
-      auto *pb_dims = desc.mutable_dims();
-      pb_dims->Resize(static_cast<int>(dims.size()), 0);
-      std::copy(dims.begin(), dims.end(), pb_dims->begin());
-      int32_t size = desc.ByteSize();
-      fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
-      auto out = desc.SerializeAsString();
-      fout.write(out.data(), size);
-    }
-    {  // the 3rd field, tensor data
-      uint64_t size = tensor.memory_size();
-      auto *data_ptr = tensor.data<void>();
-      PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
-                     "Index overflow when writing tensor");
-      if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
-        constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
-        std::unique_ptr<char[]> buf(new char[kBufSize]);
-        auto &gpu_dev_ctx =
-            static_cast<const platform::CUDADeviceContext &>(dev_ctx);
-        platform::CPUPlace cpu;
-        uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
-        while (size != 0) {
-          size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
-          memory::Copy(cpu, buf.get(),
-                       boost::get<platform::GPUPlace>(tensor.place()),
-                       reinterpret_cast<const void *>(data), size_to_write,
-                       gpu_dev_ctx.stream());
-          gpu_dev_ctx.Wait();
-          fout.write(buf.get(), size_to_write);
-          data += size_to_write;
-          size -= size_to_write;
-        }
-#else
-        PADDLE_THROW("Unexpected branch");
-#endif
-      } else {
-        fout.write(static_cast<const char *>(data_ptr),
-                   static_cast<std::streamsize>(size));
-      }
-    }
-    {  // the 4th field, lod information
-       // uint64_t lod_level
-       // uint64_t lod_level_1 size in byte.
-       // int*     lod_level_1 data
-       // ...
-      auto lod = tensor.lod();
-      uint64_t size = lod.size();
-      fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
-
-      for (auto &each : lod) {
-        size = each.size() * sizeof(framework::LoD::value_type::value_type);
-        fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
-        fout.write(reinterpret_cast<const char *>(each.data()),
-                   static_cast<std::streamsize>(size));
-      }
-    }
+    framework::SerializeToStream(fout, tensor, dev_ctx);
   }
 };
 
diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc
index 5745580504fb9bda551f21665bff5c65ae82aeb9..e5c10fec4d840c58a74758a65ddfa93421ab4827 100644
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -77,4 +77,6 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
                   ops::ScaleGradMaker);
 REGISTER_OP_CPU_KERNEL(scale,
                        ops::ScaleKernel<paddle::platform::CPUPlace, float>,
-                       ops::ScaleKernel<paddle::platform::CPUPlace, double>);
+                       ops::ScaleKernel<paddle::platform::CPUPlace, double>,
+                       ops::ScaleKernel<paddle::platform::CPUPlace, int>,
+                       ops::ScaleKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu
index 820fd4e6855bb192ec3292ea6983d5ecae73b6e6..0d707751598e65bc56bf73a435c10b4acd6d8ed0 100644
--- a/paddle/operators/scale_op.cu
+++ b/paddle/operators/scale_op.cu
@@ -16,4 +16,6 @@
 
 REGISTER_OP_GPU_KERNEL(
     scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>,
-    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>);
+    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>,
+    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int>,
+    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int64_t>);
diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a3059847f2d420359b347e3a5d514d8a3829a4e2
--- /dev/null
+++ b/paddle/operators/send_op.cc
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <ostream>
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+
+#include "paddle/operators/detail/send_recv_impl.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+namespace paddle {
+namespace operators {
+
+// TODO(typhoonzero): this is a simple implementation which only send
+// one tensor
+class SendOp : public framework::OperatorBase {
+ public:
+  SendOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {
+    // init client when the operator is created at runtime.
+    if (!client_) {
+      std::string endpoint = Attr<std::string>("endpoint");
+      client_.reset(new detail::RPCClient(
+          grpc::CreateChannel(endpoint, grpc::InsecureChannelCredentials())));
+      // TODO(typhoonzero): how to call InitVariables
+    }
+  }
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto iname = Input("X");
+    auto oname = Output("Out");
+    // TODO(typhoonzero): currently it's non-blocking,
+    // should block until server responds.
+    bool ret = client_->SendVariable(scope, iname, oname);
+    if (!ret) {
+      LOG(ERROR) << "send variable error";
+    }
+  }
+
+ protected:
+  std::shared_ptr<detail::RPCClient> client_{nullptr};
+};
+
+class SendOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SendOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) Input tensor to be saved");
+    AddOutput("Out", "(Tensor) Output fetched from server");
+    AddComment(R"DOC(
+Recv operator
+
+This operator will recv tensor from send_op
+)DOC");
+    AddAttr<std::string>("endpoint",
+                         "(string, default 127.0.0.1:6164)"
+                         "IP address to listen on.")
+        .SetDefault("127.0.0.1:6164")
+        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker);
diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ac03eb3752e7cd31dd80f4caa39dc0625f0409d5
--- /dev/null
+++ b/paddle/operators/send_recv_op_test.cc
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+// TODO(typhoonzero): add python bindings for this test as
+// a RemoteOptimizer.
+
+#include <unistd.h>
+#include <thread>
+
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/program_desc.h"
+
+USE_NO_KERNEL_OP(send);
+USE_NO_KERNEL_OP(recv);
+USE_OP(sum);
+
+// global for simplicity.
+std::unique_ptr<paddle::framework::OperatorBase> recv_op;
+
+void InitTensorsInScope(paddle::framework::Scope &scope,
+                        paddle::platform::CPUPlace &place) {
+  paddle::platform::CPUDeviceContext ctx(place);
+  auto var = scope.Var("X");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({10, 10});
+  float *expect = tensor->mutable_data<float>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<float>(i);
+  }
+
+  auto out_var = scope.Var("Out");
+  auto out_tensor = out_var->GetMutable<paddle::framework::LoDTensor>();
+  out_tensor->Resize({10, 10});
+  tensor->mutable_data<float>(place);  // allocate
+}
+
+void AddOp(const std::string &type,
+           const paddle::framework::VariableNameMap &inputs,
+           const paddle::framework::VariableNameMap &outputs,
+           paddle::framework::AttributeMap attrs,
+           paddle::framework::BlockDescBind *block) {
+  // insert output
+  for (auto kv : outputs) {
+    for (auto v : kv.second) {
+      auto var = block->Var(v);
+      var->SetDataType(paddle::framework::DataType::FP32);
+    }
+  }
+
+  // insert op
+  auto op = block->AppendOp();
+  op->SetType(type);
+  for (auto &kv : inputs) {
+    op->SetInput(kv.first, kv.second);
+  }
+  for (auto &kv : outputs) {
+    op->SetOutput(kv.first, kv.second);
+  }
+  op->SetAttrMap(attrs);
+}
+
+void StartServerNet() {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+  InitTensorsInScope(scope, place);
+
+  // sub program run in recv_op, for simple test we use sum
+  paddle::framework::ProgramDescBind program;
+  paddle::framework::BlockDescBind *block = program.MutableBlock(0);
+  // X for server side tensors, RX for received tensers, must be of same shape.
+  AddOp("sum", {{"X", {"X", "RX"}}}, {{"Out", {"Out"}}}, {}, block);
+
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
+  attrs.insert({"OptimizeBlock", block});
+  recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}},
+                                                    {{"Out", {"Out"}}}, attrs);
+  paddle::platform::CPUDeviceContext ctx(place);
+  recv_op->Run(scope, ctx);
+}
+
+TEST(SendRecvOp, CPU) {
+  std::thread server_thread(StartServerNet);
+  sleep(5);  // wait server to start
+  // local net
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+  InitTensorsInScope(scope, place);
+
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
+
+  auto send_op = paddle::framework::OpRegistry::CreateOp(
+      "send", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
+  paddle::platform::CPUDeviceContext ctx(place);
+  send_op->Run(scope, ctx);
+
+  auto in_var = scope.Var("X");
+  auto tensor = in_var->GetMutable<paddle::framework::LoDTensor>();
+  float *expected = tensor->data<float>();
+
+  auto out_var = scope.Var("Out");
+  auto target = out_var->GetMutable<paddle::framework::LoDTensor>();
+  // send fail cause output is none.
+  EXPECT_NE(target->memory_size(), size_t(0));
+  float *actual = target->data<float>();
+  for (int64_t i = 0; i < target->numel(); ++i) {
+    EXPECT_EQ(expected[i] * 2, actual[i]);
+  }
+  recv_op.reset();  // dtor can shutdown and join server thread.
+  server_thread.join();
+}
diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc
index 41cadce4c603a9c14db79e2f6b30f8664cf72a38..c5533732d44737bb8cc71fd8ac46f3c36c72ada1 100644
--- a/paddle/operators/sequence_conv_op.cc
+++ b/paddle/operators/sequence_conv_op.cc
@@ -179,7 +179,9 @@ REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
             sequence_conv_grad, ops::SequenceConvGradOp);
 
 REGISTER_OP_CPU_KERNEL(
-    sequence_conv, ops::SequenceConvKernel<paddle::platform::CPUPlace, float>);
+    sequence_conv, ops::SequenceConvKernel<paddle::platform::CPUPlace, float>,
+    ops::SequenceConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
     sequence_conv_grad,
-    ops::SequenceConvGradKernel<paddle::platform::CPUPlace, float>);
+    ops::SequenceConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::SequenceConvGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/sequence_conv_op.cu.cc b/paddle/operators/sequence_conv_op.cu.cc
index 6106b0e46c0ab96e01dfc344055f23dbf4a1a2c3..c8136dbcb35be4f1236dddc3d24546f9d91670c8 100644
--- a/paddle/operators/sequence_conv_op.cu.cc
+++ b/paddle/operators/sequence_conv_op.cu.cc
@@ -16,7 +16,9 @@
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    sequence_conv, ops::SequenceConvKernel<paddle::platform::GPUPlace, float>);
+    sequence_conv, ops::SequenceConvKernel<paddle::platform::GPUPlace, float>,
+    ops::SequenceConvKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
     sequence_conv_grad,
-    ops::SequenceConvGradKernel<paddle::platform::GPUPlace, float>);
+    ops::SequenceConvGradKernel<paddle::platform::GPUPlace, float>,
+    ops::SequenceConvGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..255683a572c0e8d54791cb0c905d85239920d992
--- /dev/null
+++ b/paddle/operators/sequence_slice_op.cc
@@ -0,0 +1,131 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_slice_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceSliceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceSliceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Offset"),
+                   "Input(Offset) of SequenceSliceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Length"),
+                   "Input(Length) of SequenceSliceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceSliceOp should not be null.");
+    auto input_dims = ctx->GetInputDim("X");
+
+    auto offset_dim = ctx->GetInputDim("Offset");
+    auto length_dim = ctx->GetInputDim("Length");
+
+    PADDLE_ENFORCE_EQ(
+        offset_dim.size(), 2UL,
+        "Only support one level sequence now, The rank of offset must be 2.");
+    PADDLE_ENFORCE_EQ(
+        length_dim.size(), 2UL,
+        "Only support one level sequence now, The rank of Length must be 2.");
+
+    // Initialize the output's dims to maximum,
+    // and re-set to real dims by the value of Offset and Length at kernel
+    ctx->SetOutputDim("Out", input_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class SequenceSliceGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "The gradient of Out should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
+                   "The gradient of X should not be null.");
+    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceSliceOpMaker(framework::OpProto* proto,
+                       framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor), "
+             "the input of SequenceSliceOp.");
+    AddInput("Offset",
+             "(Tensor), "
+             "a vector<int> to describe the offset of every input sequence for "
+             "sub sequence item.");
+    AddInput("Length",
+             "(Tensor), "
+             "a vector<int> to describe the length of every input sequence for "
+             "sub sequence item.");
+    AddOutput("Out", "(LoDTensor), the output of SequenceSliceOp.");
+    AddComment(R"DOC(
+Sequence slice operator
+
+The operator crops a subsequence from given sequence with given start offset and subsequence length.
+It only supports sequence (LoD Tensor with level number is 1).
+- Case:
+    X = [[a1, a2;
+        b1, b2;
+        c1, c2]
+       [d1, d2;
+        e1, e2]]
+    LoD(X) = {{0, 3, 5}}; Dims(X) = (5, 2)
+    Offset = [[0], [1]]; Length = [[2], [1]]
+
+    Out = [[a1, a2;
+            b1, b2]
+            [e1, e2]]
+    LoD(Out) = {{0, 2, 3}}; Dims(Out) = (3, 2)
+NOTE: The first dimension size of input, the size of offset and Length, should be equal. The offset start from 0.
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker,
+            sequence_slice_grad, ops::SequenceSliceGradOp);
+REGISTER_OP_CPU_KERNEL(
+    sequence_slice,
+    ops::SequenceSliceOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_slice_grad,
+    ops::SequenceSliceGradOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sequence_slice_op.cu b/paddle/operators/sequence_slice_op.cu
new file mode 100755
index 0000000000000000000000000000000000000000..a9f59dadba74d900fa5cc0601fb5b264ea19e34d
--- /dev/null
+++ b/paddle/operators/sequence_slice_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_slice_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    sequence_slice,
+    ops::SequenceSliceOpKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    sequence_slice_grad,
+    ops::SequenceSliceGradOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..6411e0a46630beb0a9abb6aa5e517978b25a5254
--- /dev/null
+++ b/paddle/operators/sequence_slice_op.h
@@ -0,0 +1,172 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using LoD = framework::LoD;
+
+template <typename T>
+inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data,
+                            const int64_t* length_data) {
+  auto out_lod = in.lod();
+  size_t lod_offset = 0;
+
+  auto n = in.lod()[0].size() - 1;
+  out_lod[0][0] = 0;
+  for (size_t i = 0; i < n; ++i) {
+    lod_offset += length_data[i];
+    out_lod[0][i + 1] = lod_offset;
+  }
+  return out_lod;
+}
+
+template <typename Place, typename T>
+class SequenceSliceOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<LoDTensor>("X");
+    auto* offset = ctx.Input<Tensor>("Offset");
+    auto* length = ctx.Input<Tensor>("Length");
+    auto* out = ctx.Output<LoDTensor>("Out");
+
+    auto lod = in->lod();
+    auto n = lod[0].size() - 1;
+
+    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
+    PADDLE_ENFORCE_EQ(
+        n, static_cast<size_t>(length->dims()[0]),
+        "The size of input-sequence and length-array should be the same")
+    PADDLE_ENFORCE_EQ(
+        n, static_cast<size_t>(offset->dims()[0]),
+        "The size of input-sequence and offset-array should be the same")
+
+    const int64_t* offset_data = offset->data<int64_t>();
+    const int64_t* length_data = length->data<int64_t>();
+    framework::Tensor offset_cpu;
+    framework::Tensor length_cpu;
+
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
+      framework::CopyFrom(*offset, platform::CPUPlace(), ctx.device_context(),
+                          &offset_cpu);
+      offset_data = offset_cpu.data<int64_t>();
+
+      length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
+      framework::CopyFrom(*length, platform::CPUPlace(), ctx.device_context(),
+                          &length_cpu);
+      length_data = length_cpu.data<int64_t>();
+    }
+
+    for (size_t i = 0; i < n; ++i) {
+      PADDLE_ENFORCE_LT(0, offset_data[i],
+                        "The offset[%d] must greater than zero.", i)
+      PADDLE_ENFORCE_LT(0, length_data[i],
+                        "The length[%d] must greater than zero.", i)
+      PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i],
+                        lod[0][i + 1], "The target tensor's length overflow.")
+    }
+
+    out->mutable_data<T>(ctx.GetPlace());
+    auto out_lod = SequenceSliceLoD(*in, offset_data, length_data);
+    auto out_dims = in->dims();
+    out_dims[0] = out_lod[0][out_lod[0].size() - 1];
+    out->Resize(out_dims);
+    out->set_lod(out_lod);
+
+    auto in_stride = framework::stride(in->dims());
+    auto out_stride = framework::stride(out->dims());
+
+    size_t out_offset = 0;
+    for (size_t i = 0; i < n; ++i) {
+      Tensor in_t = in->Slice(
+          static_cast<int>(lod[0][i] + offset_data[i]),
+          static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
+
+      StridedMemcpy<T>(ctx.device_context(), in_t.data<T>(), in_stride,
+                       in_t.dims(), out_stride, out->data<T>() + out_offset);
+      out_offset += length_data[i] * in_stride[0];
+    }
+  }
+};
+
+template <typename Place, typename T>
+class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<LoDTensor>("X");
+    auto* offset = ctx.Input<Tensor>("Offset");
+    auto* length = ctx.Input<Tensor>("Length");
+    auto* out_grad =
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto* x_grad =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+
+    const int64_t* offset_data = offset->data<int64_t>();
+    const int64_t* length_data = length->data<int64_t>();
+    framework::Tensor offset_cpu;
+    framework::Tensor length_cpu;
+
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
+      framework::CopyFrom(*offset, platform::CPUPlace(), ctx.device_context(),
+                          &offset_cpu);
+      offset_data = offset_cpu.data<int64_t>();
+
+      length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
+      framework::CopyFrom(*length, platform::CPUPlace(), ctx.device_context(),
+                          &length_cpu);
+      length_data = length_cpu.data<int64_t>();
+    }
+
+    auto lod = in->lod();
+    auto out_lod = out_grad->lod();
+
+    if (x_grad) {
+      x_grad->mutable_data<T>(ctx.GetPlace());
+      x_grad->set_lod(in->lod());
+      math::SetConstant<Place, T> set_zero;
+      set_zero(ctx.device_context(), x_grad, static_cast<T>(0));
+
+      auto out_grad_stride = framework::stride(out_grad->dims());
+
+      for (size_t i = 0; i < out_lod[0].size() - 1; ++i) {
+        Tensor out_grad_t =
+            out_grad->Slice(static_cast<int>(out_lod[0][i]),
+                            static_cast<int>(out_lod[0][i + 1]));
+        auto out_grad_stride = framework::stride(out_grad_t.dims());
+
+        auto x_grad_stride = framework::stride(x_grad->dims());
+
+        Tensor x_grad_t = x_grad->Slice(
+            static_cast<int>(lod[0][i] + offset_data[i]),
+            static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
+
+        StridedMemcpy<T>(ctx.device_context(), out_grad_t.data<T>(),
+                         out_grad_stride, out_grad_t.dims(), x_grad_stride,
+                         x_grad_t.data<T>());
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index 72f4e4d5cbcd692423fa2a3e9ec8e7033b552c3c..5576d7b8be060a3c58cb18ed667041562cf853b8 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -55,7 +55,7 @@ SGD operator
 
 This operator implements one step of the stochastic gradient descent algorithm.
 
-$$param_out = param - learning_rate * grad$$
+$$param\_out = param - learning\_rate * grad$$
 
 )DOC");
   }
diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc
index 65bccc0c81d0ad9674649933a20ec7b09fec5b37..c380e606869fd2c559c7d5f378857ca74fa8d8d3 100644
--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/operators/shrink_rnn_memory_op.cc
@@ -57,11 +57,21 @@ class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker {
   ShrinkRNNMemoryOpProtoMaker(framework::OpProto *proto,
                               framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "");
-    AddInput("RankTable", "");
-    AddInput("I", "");
-    AddOutput("Out", "");
-    AddComment("");
+    AddInput("X", "(LoDTensor) The RNN step memory to be shrinked.");
+    AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN.");
+    AddInput("I",
+             "(LoDTensor) The step index. The RNN step memory 'X' will be "
+             "shrinked to match the size of the input of the index'th step.");
+    AddOutput("Out", "(LoDTensor) The shrinked RNN step memory.");
+    AddComment(
+        R"DOC(
+        In dynamic RNN, we are able to handle sequences of different lengths. 
+        Because of the multiple lengths, the size of each step input can be 
+        different, which may lead to a mismatching between the input of
+        the current step and the memory generated by the previous one. This 
+        operator shrinks memory according to the size of the next step input, 
+        to make sure that they can match each other.
+        )DOC");
   }
 };
 
@@ -101,8 +111,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
     } else {
       auto &dout_tensor = dout_var->Get<framework::LoDTensor>();
       auto height = dout_tensor.dims()[0];
-      dx_tensor.Slice(0, static_cast<int>(height))
-          .CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx);
+      auto slice = dx_tensor.Slice(0, static_cast<int>(height));
+      framework::CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
       if (dx_tensor.dims()[0] < height) {
         auto rest_tensor = dx_tensor.Slice(
             static_cast<int>(height), static_cast<int>(dout_tensor.dims()[0]));
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index 93f89e33a73c5f4c6c0e5a8793a0abe7c692b656..93e0525badc26808f0dca70cc1153ac728f1fe9c 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -59,7 +59,7 @@ Then the ratio of the exponential of the given dimension and the sum of
 exponential values of all the other dimensions is the output of the softmax
 operator.
 
-For each row `i` and each column `j` in input X, we have:
+For each row $i$ and each column $j$ in Input(X), we have:
     $$Y[i, j] = \frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}$$
 
 )DOC");
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc
index 3dbb62d2e571eb92025c1b3fc0a6653c7cda007a..fc027d6f95cdbc24af59ef1188b6f16f6a93e85c 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -67,15 +67,15 @@ The equation is as follows:
 
 1) Hard label (one-hot label, so every sample has exactly one class)
 
-$$Loss_j = \f$ -\text{Logit}_{Label_j} +
+$$Loss_j =  -\text{Logit}_{Label_j} +
 \log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right),
-j = 1, ..., K $\f$$
+j = 1,..., K$$
 
 2) Soft label (each sample can have a distribution over all classes)
 
-$$Loss_j = \f$ -\sum_{i=0}^{K}\text{Label}_i\left(\text{Logit}_i -
+$$Loss_j =  -\sum_{i=0}^{K}\text{Label}_i \left(\text{Logit}_i -
 \log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right),
-j = 1,...,K $\f$$
+j = 1,...,K$$
 
 )DOC");
   }
diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc
index db635f2ba0804143c9a2e04ff006dfbc8744f3fc..f164a4771186635232fea46327ca1fb8b86f2852 100644
--- a/paddle/operators/split_lod_tensor_op.cc
+++ b/paddle/operators/split_lod_tensor_op.cc
@@ -49,7 +49,7 @@ class SplitLoDTensorOp : public framework::OperatorBase {
       cpu_mask->ShareDataWith(mask);
     } else if (platform::is_gpu_place(mask.place())) {
 #ifdef PADDLE_WITH_CUDA
-      cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx);
+      framework::CopyFrom(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
 #else
       PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
 #endif
@@ -105,10 +105,11 @@ class SplitLoDTensorOp : public framework::OperatorBase {
           continue;
         }
         // out[offset: offset+len] = x[each_range.begin: each_range.end]
-        out->Slice(static_cast<int>(offset), static_cast<int>(offset + len))
-            .CopyFrom(x.Slice(static_cast<int>(each_range.begin),
-                              static_cast<int>(each_range.end)),
-                      x.place(), dev_ctx);
+        auto slice = out->Slice(static_cast<int>(offset),
+                                static_cast<int>(offset + len));
+        framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin),
+                                    static_cast<int>(each_range.end)),
+                            x.place(), dev_ctx, &slice);
         offset += len;
       }
     }
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index c2b7632b2865a3ef66051d815d7722a08c6a8cbd..ddc210c26e69566fef9baa20f49ba1052e993b3f 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -176,4 +176,6 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker,
                   ops::SumOpVarTypeInference);
 REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel<paddle::platform::CPUPlace, float>,
-                       ops::SumKernel<paddle::platform::CPUPlace, double>);
+                       ops::SumKernel<paddle::platform::CPUPlace, double>,
+                       ops::SumKernel<paddle::platform::CPUPlace, int>,
+                       ops::SumKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/operators/sum_op.cu b/paddle/operators/sum_op.cu
index 5cf05b876b6d6a2ce61d9e10b7ec52ed3cef57d7..5c30dd4d470c2e0acecef18524a4a81f9eb786a9 100644
--- a/paddle/operators/sum_op.cu
+++ b/paddle/operators/sum_op.cu
@@ -14,4 +14,6 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>,
-                       ops::SumKernel<paddle::platform::GPUPlace, double>);
+                       ops::SumKernel<paddle::platform::GPUPlace, double>,
+                       ops::SumKernel<paddle::platform::GPUPlace, int>,
+                       ops::SumKernel<paddle::platform::GPUPlace, int64_t>);
diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h
index 4ca15611392b3117aa6c92cba95911eb8bebeb15..4afec03ecef168077c9964f5cb1da7cd61861f40 100644
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -102,8 +102,8 @@ class SumKernel : public framework::OpKernel<T> {
               out_array.resize(i + 1);
             }
             if (out_array[i].numel() == 0) {
-              out_array[i].CopyFrom(in_array[i], in_array[i].place(),
-                                    context.device_context());
+              framework::CopyFrom(in_array[i], in_array[i].place(),
+                                  context.device_context(), &out_array[i]);
               out_array[i].set_lod(in_array[i].lod());
             } else {
               PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
diff --git a/paddle/operators/tensor.save b/paddle/operators/tensor.save
new file mode 100644
index 0000000000000000000000000000000000000000..c24308a7d0131b84c28c0a9857cce4949afb2091
Binary files /dev/null and b/paddle/operators/tensor.save differ
diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc
index ae1b48d7a8e3d573a5134a822a2ed5ef70511077..ad09fb53ce8c9bf0187e595fe3cdcb6685ab9889 100644
--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -38,7 +38,7 @@ class WriteToArrayOp : public ArrayOp {
       out->resize(offset + 1);
     }
     auto *out_tensor = &out->at(offset);
-    out_tensor->CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx);
+    CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor);
     out_tensor->set_lod(x_tensor.lod());
   }
 };
@@ -116,7 +116,8 @@ class ReadFromArrayOp : public ArrayOp {
     auto *out_tensor = out->GetMutable<framework::LoDTensor>();
     size_t offset = GetOffset(scope, dev_ctx);
     PADDLE_ENFORCE_LT(offset, x_array.size());
-    out_tensor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx);
+    framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx,
+                        out_tensor);
     out_tensor->set_lod(x_array[offset].lod());
   }
 };
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
index 7975efc7cf134aaf591385a6866254a9c5f2a0bb..fff1dc7ccddf1d8cee0c8311828fd38888283cd1 100644
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -66,7 +66,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        static_cast<framework::DataType>(ctx.Attr<int>("data_type")),
+        static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
         ctx.device_context());
   }
 };
@@ -99,7 +99,7 @@ uniform distribution.
                  "Random seed used for generating samples. "
                  "0 means use a seed generated by the system.")
         .SetDefault(0);
-    AddAttr<int>("data_type", "(int, default 5(FP32)) Output tensor data type")
+    AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type")
         .SetDefault(framework::DataType::FP32);
   }
 };
diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..89c48e071cf351f7d7b9cf26a5d4989af291da57
--- /dev/null
+++ b/paddle/operators/unpool_op.cc
@@ -0,0 +1,143 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/unpool_op.h"
+namespace paddle {
+namespace operators {
+
+class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Unpool2dOpMaker(framework::OpProto* proto,
+                  framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of unpool operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddInput(
+        "Indices",
+        "(Tensor) The input tensor of the indices given out by MaxPool2d. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of unpool operator."
+              "The format of output tensor is also NCHW."
+              "Where N is batch size, C is "
+              "the number of channels, H and W is the height and "
+              "width of feature.");
+    AddAttr<std::vector<int>>(
+        "ksize",
+        "(vector), the unpooling window size(height, width) "
+        "of unpooling operator.");
+    AddAttr<std::vector<int>>("strides",
+                              "(vector, default:{1, 1}), "
+                              "strides (height, width) of unpooling operator.")
+        .SetDefault({1, 1});
+    AddAttr<std::vector<int>>("paddings",
+                              "(vector defalut:{0,0}), "
+                              "paddings (height, width) of unpooling operator.")
+        .SetDefault({0, 0});
+    AddAttr<std::string>(
+        "unpooling_type",
+        "(string), unpooling type, can be \"max\" for max-unpooling ")
+        .InEnum({"max"});
+    AddComment(R"DOC(
+        "Input shape: $(N, C_{in}, H_{in}, W_{in})$
+        Output shape: $(N, C_{out}, H_{out}, W_{out})$
+        Where
+          $$
+            H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\
+            W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1]
+          $$
+        Paper: http://www.matthewzeiler.com/wp-content/uploads/2017
+        /07/iccv2011.pdf
+        )DOC");
+  }
+};
+
+int OutputSize(int input_size, int ksize, int padding, int stride) {
+  int output_size = (input_size - 1) * stride - 2 * padding + ksize;
+  return output_size;
+}
+
+class UnpoolOp : public framework::OperatorWithKernel {
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of UnpoolOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Indices"),
+                   "Input(Indices) of UnpoolOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of UnpoolOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    auto in_y_dims = ctx->GetInputDim("Indices");
+    std::string unpooling_type =
+        ctx->Attrs().Get<std::string>("unpooling_type");
+    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    PADDLE_ENFORCE(in_x_dims.size() == 4,
+                   "Unpooling intput must be of 4-dimensional.");
+    PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims);
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      output_shape.push_back(
+          OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+
+class UnpoolOpGrad : public framework::OperatorWithKernel {
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Input(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad,
+            ops::UnpoolOpGrad);
+REGISTER_OP_CPU_KERNEL(unpool,
+                       ops::UnpoolKernel<paddle::platform::CPUPlace, float>,
+                       ops::UnpoolKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    unpool_grad, ops::UnpoolGradKernel<paddle::platform::CPUPlace, float>,
+    ops::UnpoolGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..18aafb7dc74ed474ed3ec5e8a388ecdb71b9a8f5
--- /dev/null
+++ b/paddle/operators/unpool_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/unpool_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(unpool,
+                       ops::UnpoolKernel<paddle::platform::GPUPlace, float>,
+                       ops::UnpoolKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    unpool_grad, ops::UnpoolGradKernel<paddle::platform::GPUPlace, float>,
+    ops::UnpoolGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..243eb7e532c5149db4fb1b381fd8664ae4bdd81a
--- /dev/null
+++ b/paddle/operators/unpool_op.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/unpooling.h"
+
+namespace paddle {
+namespace operators {
+template <typename Place, typename T>
+class UnpoolKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
+    auto* out = context.Output<framework::Tensor>("Out");
+    std::string unpooling_type = context.Attr<std::string>("unpooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    T* output_data = out->mutable_data<T>(context.GetPlace());
+    if (output_data) {
+      math::SetConstant<Place, T> set_zero;
+      set_zero(context.device_context(), out, static_cast<T>(0));
+    }
+    math::Unpool2dMaxFunctor<Place, T> unpool2d_max_forward;
+    unpool2d_max_forward(context.device_context(), *in_x, *in_y, out);
+  }
+};
+template <typename Place, typename T>
+class UnpoolGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
+    const framework::Tensor* out = context.Input<framework::Tensor>("Out");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    std::string unpooling_type = context.Attr<std::string>("unpooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    auto& device_ctx = context.device_context();
+    math::SetConstant<Place, T> zero;
+    if (in_x_grad) {
+      in_x_grad->mutable_data<T>(context.GetPlace());
+      zero(device_ctx, in_x_grad, static_cast<T>(0));
+    }
+    math::Unpool2dMaxGradFunctor<Place, T> unpool2d_max_backward;
+    unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out,
+                          *out_grad, in_x_grad);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
index dcc59f5ff2ae3a8ca999d72a20cfd5c759987d89..68b4f7705995e5ecb6c9b8216db7373c1777a31e 100644
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -180,7 +180,7 @@ class WhileGradOp : public framework::OperatorBase {
           if (var->IsType<LoDTensor>()) {
             auto &inside_tensor = var->Get<framework::LoDTensor>();
             framework::AttributeMap attrs;
-            attrs["data_type"] = framework::ToDataType(inside_tensor.type());
+            attrs["dtype"] = framework::ToDataType(inside_tensor.type());
             attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
             attrs["value"] = 0.0f;
 
diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp
index 8b3be062b654a52e667626199be8c8bb4a2a96d7..1898598e49652a2829e57329bab6017304cec662 100644
--- a/paddle/parameter/ParameterUpdateFunctions.cpp
+++ b/paddle/parameter/ParameterUpdateFunctions.cpp
@@ -30,7 +30,7 @@ void sgdUpdateCpu(real learningRate,
                   const real* grad,
                   real* momentumVec) {
   decayRate *= learningRate;
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_USE_MKLML
 #pragma omp parallel for
 #endif
   for (size_t i = 0; i < size; ++i) {
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index bd86a9fe268c277065cd450f91b544def6c4d32f..88df28a9668e5f354d115ff8ab32cb21e03aefb5 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,15 +1,20 @@
-cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog)
+if(WITH_GPU)
+  cc_library(enforce SRCS enforce.cc DEPS nccl)
+else()
+  cc_library(enforce SRCS enforce.cc)
+endif()
+cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)
+
+cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog enforce)
 cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
 
-nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog)
+nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
 
-cc_library(place SRCS place.cc)
+cc_library(place SRCS place.cc DEPS enforce)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
 add_subdirectory(dynload)
 
-cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece)
-
 IF(WITH_GPU)
     set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
 ELSE()
diff --git a/paddle/platform/cuda_helper.h b/paddle/platform/cuda_helper.h
index a7d99cde106a0a66f122a8c43f49717c03e60dec..376bb0e6887c797c3c1019e92f738a62d01a9c51 100644
--- a/paddle/platform/cuda_helper.h
+++ b/paddle/platform/cuda_helper.h
@@ -31,6 +31,16 @@ constexpr int PADDLE_CUDA_NUM_THREADS = 512;
 
 // For atomicAdd.
 USE_CUDA_ATOMIC(Add, float);
+USE_CUDA_ATOMIC(Add, int);
+USE_CUDA_ATOMIC(Add, unsigned int);
+USE_CUDA_ATOMIC(Add, unsigned long long int);
+
+CUDA_ATOMIC_WRAPPER(Add, int64_t) {
+  static_assert(sizeof(int64_t) == sizeof(long long int),
+                "long long should be int64");
+  return CudaAtomicAdd(reinterpret_cast<unsigned long long int*>(address),
+                       static_cast<unsigned long long int>(val));
+}
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
 USE_CUDA_ATOMIC(Add, double);
diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6311cb23d695c3cd851bcca120c24cced7fdd62
--- /dev/null
+++ b/paddle/platform/cuda_profiler.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cuda_profiler_api.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+namespace paddle {
+namespace platform {
+
+void CudaProfilerInit(std::string output_file, std::string output_mode,
+                      std::vector<std::string> config_flags) {
+  std::array<char, 128> buf;
+  std::string tmpl = "/tmp/cuda_profile_config.XXXXXX";
+  PADDLE_ENFORCE_LT(tmpl.size(), buf.size());
+  memcpy(buf.data(), tmpl.data(), tmpl.size());
+  auto result = mktemp(buf.data());
+  PADDLE_ENFORCE(strlen(result) != 0);
+  std::string config_file = result;
+
+  {
+    std::ofstream ofs(config_file, std::ios::out | std::ios::trunc);
+    PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate());
+    for (const auto& line : config_flags) {
+      ofs << line << std::endl;
+    }
+  }
+
+  PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
+  cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
+  PADDLE_ENFORCE(
+      cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
+}
+
+void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); }
+
+void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); }
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/cudnn_helper.h b/paddle/platform/cudnn_helper.h
index ce3421a3cb840e4c1e872eea12dedc1150c85962..80a4c9bb4bbcd03cf849d86118db4e502382f031 100644
--- a/paddle/platform/cudnn_helper.h
+++ b/paddle/platform/cudnn_helper.h
@@ -63,9 +63,10 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
     }                                                             \
   } while (false)
 
-enum class DataLayout {
+enum class DataLayout {  // Not use
   kNHWC,
   kNCHW,
+  kNCDHW,
   kNCHW_VECT_C,
 };
 
@@ -107,12 +108,15 @@ class CudnnDataType<double> {
   }
 };
 
-inline cudnnTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) {
+inline cudnnTensorFormat_t GetCudnnTensorFormat(
+    const DataLayout& order) {  // Not use
   switch (order) {
     case DataLayout::kNHWC:
       return CUDNN_TENSOR_NHWC;
     case DataLayout::kNCHW:
       return CUDNN_TENSOR_NCHW;
+    case DataLayout::kNCDHW:
+      return CUDNN_TENSOR_NCHW;  // NOTE: cudnn treat NdTensor as the same
     default:
       PADDLE_THROW("Unknown cudnn equivalent for order");
   }
@@ -139,7 +143,7 @@ class ScopedTensorDescriptor {
       strides[i] = dims[i + 1] * strides[i + 1];
     }
     // Update tensor descriptor dims setting if groups > 1
-    // FIXME(typhoonzero): Assume using NCHW order
+    // NOTE: Assume using NCHW or NCDHW order
     std::vector<int> dims_with_group(dims.begin(), dims.end());  // copy
     if (groups > 1) {
       dims_with_group[1] = dims_with_group[1] / groups;
@@ -176,12 +180,12 @@ class ScopedFilterDescriptor {
                                             const cudnnDataType_t type,
                                             const std::vector<int>& kernel,
                                             const int groups = 1) {
-    // filter layout: MCHW, where M is the number of
+    // filter layout: MCHW(MCDHW), where M is the number of
     // output image channels, C is the number of input image channels,
-    // H and W is height and width of filter.
+    // D is the depth of the filter, H is the height of the filter, and W is the
+    // width of the filter.
     std::vector<int> kernel_with_group(kernel.begin(), kernel.end());
     if (groups > 1) {
-      // M /= groups
       kernel_with_group[0] /= groups;
       // NOTE: input filter(C) of the filter is already asserted to be C/groups.
     }
@@ -219,13 +223,15 @@ class ScopedConvolutionDescriptor {
     PADDLE_ENFORCE_EQ(pads.size(), strides.size());
     PADDLE_ENFORCE_EQ(pads.size(), dilations.size());
 
-#if CUDNN_VERSION < 6000
+#if !CUDNN_VERSION_MIN(6, 0, 0)
     // cudnn v5 does not support dilation conv, the argument is called upscale
     // instead of dilations and it is must be one.
     for (size_t i = 0; i < dilations.size(); ++i) {
       PADDLE_ENFORCE_EQ(
           dilations[i], 1,
-          "Dilations conv is not supported in this cuDNN version");
+          "Dilations conv is not supported in this cuDNN version(%d.%d.%d).",
+          CUDNN_VERSION / 1000, CUDNN_VERSION % 1000 / 100,
+          CUDNN_VERSION % 100);
     }
 #endif
 
diff --git a/paddle/platform/cudnn_helper_test.cc b/paddle/platform/cudnn_helper_test.cc
index 6bd85ae1ca8b47b203e0321e9d9224d5cfd3a586..427359f69713b961c4730b697d3ccde5f7085838 100644
--- a/paddle/platform/cudnn_helper_test.cc
+++ b/paddle/platform/cudnn_helper_test.cc
@@ -38,6 +38,26 @@ TEST(CudnnHelper, ScopedTensorDescriptor) {
   EXPECT_EQ(strides[2], 6);
   EXPECT_EQ(strides[1], 36);
   EXPECT_EQ(strides[0], 144);
+
+  // test tensor5d: ScopedTensorDescriptor
+  ScopedTensorDescriptor tensor5d_desc;
+  std::vector<int> shape_5d = {2, 4, 6, 6, 6};
+  auto desc_5d = tensor5d_desc.descriptor<float>(DataLayout::kNCDHW, shape_5d);
+
+  std::vector<int> dims_5d(5);
+  std::vector<int> strides_5d(5);
+  paddle::platform::dynload::cudnnGetTensorNdDescriptor(
+      desc_5d, 5, &type, &nd, dims_5d.data(), strides_5d.data());
+
+  EXPECT_EQ(nd, 5);
+  for (size_t i = 0; i < dims_5d.size(); ++i) {
+    EXPECT_EQ(dims_5d[i], shape_5d[i]);
+  }
+  EXPECT_EQ(strides_5d[4], 1);
+  EXPECT_EQ(strides_5d[3], 6);
+  EXPECT_EQ(strides_5d[2], 36);
+  EXPECT_EQ(strides_5d[1], 216);
+  EXPECT_EQ(strides_5d[0], 864);
 }
 
 TEST(CudnnHelper, ScopedFilterDescriptor) {
@@ -60,6 +80,20 @@ TEST(CudnnHelper, ScopedFilterDescriptor) {
   for (size_t i = 0; i < shape.size(); ++i) {
     EXPECT_EQ(kernel[i], shape[i]);
   }
+
+  ScopedFilterDescriptor filter_desc_4d;
+  std::vector<int> shape_4d = {2, 3, 3, 3};
+  auto desc_4d = filter_desc.descriptor<float>(DataLayout::kNCDHW, shape_4d);
+
+  std::vector<int> kernel_4d(4);
+  paddle::platform::dynload::cudnnGetFilterNdDescriptor(
+      desc_4d, 4, &type, &format, &nd, kernel_4d.data());
+
+  EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format);
+  EXPECT_EQ(nd, 4);
+  for (size_t i = 0; i < shape_4d.size(); ++i) {
+    EXPECT_EQ(kernel_4d[i], shape_4d[i]);
+  }
 }
 
 TEST(CudnnHelper, ScopedConvolutionDescriptor) {
diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt
index bb3fec1be9e811c26cc6851314e960e96fc366b3..f4fda65907dc26e9edb91ee46f3b8bd2de7b3f3a 100644
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
@@ -1,3 +1,3 @@
-cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
+cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc
         DEPS dynamic_loader nccl)
diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc
index d3e4cb567d71b987724366b6a0896f5df0eb6055..761d9edd87f428ba140d29a566fc3401199bab15 100644
--- a/paddle/platform/dynload/cudnn.cc
+++ b/paddle/platform/dynload/cudnn.cc
@@ -37,6 +37,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
 #endif
 
+#ifdef CUDNN_DNN_ROUTINE_EACH_R7
+CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
+#endif
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h
index b2d69da93bcd4a5c8e694a18ca648ddc4bd947af..61caac545014db2a09e2ada0b508419578c49740 100644
--- a/paddle/platform/dynload/cudnn.h
+++ b/paddle/platform/dynload/cudnn.h
@@ -135,6 +135,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
+#if CUDNN_VERSION >= 7001
+#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
+  __macro(cudnnSetConvolutionGroupCount);
+CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/enforce.cc b/paddle/platform/enforce.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e8d31bc782ec3cddd18ceaedf88fe5e7b4aed2cc
--- /dev/null
+++ b/paddle/platform/enforce.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace platform {}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index bfe708748a62ff9ac5d151bc652142e1f4925c83..415020ab965fa976c37870b7ad5794aab947fb4e 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -49,7 +49,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-namespace {
 #ifdef __GNUC__
 inline std::string demangle(std::string name) {
   int status = -4;  // some arbitrary value to eliminate the compiler warning
@@ -60,7 +59,6 @@ inline std::string demangle(std::string name) {
 #else
 inline std::string demangle(std::string name) { return name; }
 #endif
-}
 
 struct EnforceNotMet : public std::exception {
   std::exception_ptr exp_;
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
index a9bcc474387513a8ca019bc9382b88c93e08ff8d..a54dc0d9fdb3c30391b01966ad493540c8ad1375 100644
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
@@ -1,8 +1,8 @@
 if(WITH_PYTHON)
   cc_library(paddle_pybind SHARED
     SRCS pybind.cc exception.cc protobuf.cc
-    DEPS pybind python backward proto_desc tensor_array paddle_memory executor prune
+    DEPS pybind python backward proto_desc paddle_memory executor prune
     ${GLOB_OP_LIB})
 endif(WITH_PYTHON)
 
-cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB} tensor_array)
+cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB})
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index 5a1ff9b7976abbe4a37f8366181d9d1ae78ea4a0..6c8f06cccb92fa9cd22fdb89a9d410e6853895cc 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -202,9 +202,9 @@ void BindVarDsec(py::module &m) {
            },
            py::return_value_policy::reference)
       .def("set_shape", &VarDescBind::SetShape)
-      .def("set_data_type", &VarDescBind::SetDataType)
+      .def("set_dtype", &VarDescBind::SetDataType)
       .def("shape", &VarDescBind::Shape, py::return_value_policy::reference)
-      .def("data_type", &VarDescBind::GetDataType)
+      .def("dtype", &VarDescBind::GetDataType)
       .def("lod_level", &VarDescBind::GetLodLevel)
       .def("set_lod_level", &VarDescBind::SetLoDLevel)
       .def("type", &VarDescBind::GetType)
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 3d8d3f1d2fd3977f945928c723db5fcafffeae85..c16d3e0cbe01f90a5aa9a5d7a523cd4e282e4771 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -26,9 +26,7 @@ limitations under the License. */
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/prune.h"
 #include "paddle/framework/selected_rows.h"
-#include "paddle/framework/tensor_array.h"
 #include "paddle/operators/cond_op.h"
-#include "paddle/operators/dynamic_recurrent_op.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
@@ -39,6 +37,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/operators/nccl/nccl_gpu_common.h"
+#include "paddle/platform/cuda_profiler.h"
 #include "paddle/platform/gpu_info.h"
 #endif
 
@@ -293,6 +292,11 @@ All parameter, weight, gradient are variables in Paddle.
     Prune(*prog_with_targets.Proto(), &pruned_desc);
     return new ProgramDescBind(pruned_desc);
   });
+  m.def("inference_optimize", [](ProgramDescBind &origin) {
+    ProgramDesc pruned_desc;
+    InferenceOptimize(*(origin.Proto()), &pruned_desc);
+    return new ProgramDescBind(pruned_desc);
+  });
   m.def_submodule(
        "var_names",
        "The module will return special predefined variable name in Paddle")
@@ -390,83 +394,6 @@ All parameter, weight, gradient are variables in Paddle.
         self->CompleteAddOp();
       });
 
-  py::class_<framework::TensorArray>(m, "TensorArray")
-      .def("__init__",
-           [](TensorArray &instance) { new (&instance) TensorArray(); })
-      .def("read",
-           [](TensorArray &self, size_t index) { return self.Read(index); })
-      .def("write", [](TensorArray &self, size_t index,
-                       LoDTensor &value) { self.Write(index, value); })
-      .def("write_shared",
-           [](TensorArray &self, size_t index, const LoDTensor &value) {
-             self.WriteShared(index, value);
-           })
-      .def("size", [](TensorArray &self) { return self.size(); })
-      .def("pack",
-           [](TensorArray &self, size_t level,
-              const std::vector<std::vector<size_t>> &meta_info,
-              const std::vector<std::vector<size_t>> &lod) {
-             std::vector<DySeqMeta> meta;
-             for (auto &info : meta_info) {
-               PADDLE_ENFORCE_EQ(info.size(), 3UL);
-               meta.emplace_back(info[0], info[1], info[2]);
-             }
-#ifndef PADDLE_WITH_CUDA
-             return self.Pack(level, meta, lod);
-#else
-             LoD new_lod;
-             new_lod.reserve(lod.size());
-             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             return self.Pack(level, meta, new_lod);
-#endif
-           })
-      .def("unpack",
-           [](TensorArray &self, const LoDTensor &source, int level,
-              bool length_descend) {
-             auto metas = self.Unpack(source, level, length_descend);
-             std::vector<std::vector<size_t>> meta_info;
-             for (auto meta : metas) {
-               meta_info.emplace_back(
-                   std::vector<size_t>({meta.begin, meta.end, meta.ori_idx}));
-             }
-             return meta_info;
-           })
-      .def("stack", [](TensorArray &self) { return self.Stack(); })
-      .def("unstack",
-           [](TensorArray &self, const LoDTensor &source) {
-             return self.Unstack(source);
-           })
-      .def("unstack_shared", [](TensorArray &self, const LoDTensor &source) {
-        return self.UnstackShared(source);
-      });
-
-  py::class_<operators::DynamicRecurrentOp, OperatorBase>(m,
-                                                          "DynamicRecurrentOp")
-      .def_static("create",
-                  [](py::bytes protobin) -> operators::DynamicRecurrentOp * {
-                    OpDesc desc;
-                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
-                                   "Cannot parse user input to OpDesc");
-                    PADDLE_ENFORCE(desc.IsInitialized(),
-                                   "User OpDesc is not initialized, reason %s",
-                                   desc.InitializationErrorString());
-                    auto rnn_op = OpRegistry::CreateOp(desc);
-                    return static_cast<operators::DynamicRecurrentOp *>(
-                        rnn_op.release());
-                  })
-      .def("set_step_unit",
-           [](operators::DynamicRecurrentOp &self, const operators::NetOp &net)
-               -> void { self.rnn.SetStepUnit(net.Clone()); })
-      .def("get_state",
-           [](operators::DynamicRecurrentOp &self, const std::string &name)
-               -> const TensorArray & { return self.rnn.state(name); })
-      .def("get_step_input",
-           [](operators::DynamicRecurrentOp &self, const std::string &name)
-               -> const TensorArray & { return self.rnn.step_input(name); })
-      .def("get_step_output",
-           [](operators::DynamicRecurrentOp &self, const std::string &name)
-               -> const TensorArray & { return self.rnn.step_output(name); });
-
   // cond_op
   py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
       .def_static("create",
@@ -534,6 +461,10 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("op_support_gpu", OpSupportGPU);
 #ifdef PADDLE_WITH_CUDA
   m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
+
+  m.def("nvprof_init", platform::CudaProfilerInit);
+  m.def("nvprof_start", platform::CudaProfilerStart);
+  m.def("nvprof_stop", platform::CudaProfilerStop);
 #endif
 
   return m.ptr();
diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md
index b5fd68839ddb62e76f2fd930248d546bc093a892..f3a6f1dba7588c6b29c1dcae26ec134c1a7f937d 100644
--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
@@ -57,8 +57,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
 | `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
 | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
 | `WITH_TESTING` | ON | Build unit tests binaries. |
-| `WITH_MKLDNN` | ON | Build with [Intel® MKL DNN](https://github.com/01org/mkl-dnn) support. |
-| `WITH_MKLML` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) support. |
+| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
 | `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
 | `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
 | `WITH_C_API` | OFF | Build capi libraries for inference. |
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index e9c89eee1af1fcc4a7f168af5ec8b16912616687..a2fdc5ce69bfdf0fadb808e4b51c8eef4ff7dfd6 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -16,11 +16,13 @@ function cmake_gen() {
         echo "using python abi: $1"
         if [ "$1" == "cp27-cp27m" ]; then
             export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
+            export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
             PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
         -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
         -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
         elif [ "$1" == "cp27-cp27mu" ]; then
             export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
+            export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
             PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
         -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
         -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
@@ -34,8 +36,7 @@ function cmake_gen() {
         ${PYTHON_FLAGS}
         -DWITH_DOC=OFF
         -DWITH_GPU=${WITH_GPU:-OFF}
-        -DWITH_MKLDNN=${WITH_MKLDNN:-ON}
-        -DWITH_MKLML=${WITH_MKLML:-ON}
+        -DWITH_MKL=${WITH_MKL:-ON}
         -DWITH_AVX=${WITH_AVX:-OFF}
         -DWITH_GOLANG=${WITH_GOLANG:-ON}
         -DWITH_SWIG_PY=ON
@@ -56,8 +57,7 @@ EOF
         ${PYTHON_FLAGS} \
         -DWITH_DOC=OFF \
         -DWITH_GPU=${WITH_GPU:-OFF} \
-        -DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
-        -DWITH_MKLML=${WITH_MKLML:-ON} \
+        -DWITH_MKL=${WITH_MKL:-ON} \
         -DWITH_AVX=${WITH_AVX:-OFF} \
         -DWITH_GOLANG=${WITH_GOLANG:-ON} \
         -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
@@ -146,7 +146,7 @@ function gen_dockerfile() {
     DOCKERFILE_GPU_ENV=""
     DOCKERFILE_CUDNN_DSO=""
     if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
-        DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
+        DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
         DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
     fi
 
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index b9a49526a7e02131767a4e9b26cd0b53278176d0..d71cb84df3785008ea5793519fc26a174e1b95f7 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -18,8 +18,8 @@ function version(){
         echo "PaddlePaddle @PADDLE_VERSION@, compiled with"
         echo "    with_avx: @WITH_AVX@"
         echo "    with_gpu: @WITH_GPU@"
+        echo "    with_mkl: @WITH_MKL@"
         echo "    with_mkldnn: @WITH_MKLDNN@"
-        echo "    with_mklml: @WITH_MKLML@"
         echo "    with_double: @WITH_DOUBLE@"
         echo "    with_python: @WITH_PYTHON@"
         echo "    with_rdma: @WITH_RDMA@"
@@ -45,8 +45,8 @@ function ver2num() {
 
 function cpu_config() {
   # auto set KMP_AFFINITY and OMP_DYNAMIC from Hyper Threading Status
-  # only when MKLDNN or MKLML enabled
-  if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF"]; then
+  # only when MKL enabled
+  if [ "@WITH_MKL@" == "OFF" ]; then
     return 0
   fi
   ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
@@ -70,8 +70,8 @@ function cpu_config() {
 function threads_config() {
   # auto set OMP_NUM_THREADS and MKL_NUM_THREADS
   # according to trainer_count and total processors
-  # only when MKLDNN or MKLML enabled
-  if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF"]; then
+  # only when MKL enabled
+  if [ "@WITH_MKL@" == "OFF" ]; then
     return 0
   fi
   processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
index 973b2736e5ce2b733d52df4f5a270b296bca2cac..7d54f0254c8ea9367a34233602293db5b8593f9a 100755
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -6,13 +6,14 @@ mkdir -p $TRAVIS_BUILD_DIR/build
 cd $TRAVIS_BUILD_DIR/build
 
 # Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
 make -j `nproc` gen_proto_py
 make -j `nproc` paddle_docs paddle_docs_cn
 
 # check websites for broken links
-linkchecker doc/en/html/index.html
-linkchecker doc/cn/html/index.html
+# It will be failed now!
+#linkchecker doc/en/html/index.html
+#linkchecker doc/cn/html/index.html
 
 # Parse Github URL
 REPO=`git config remote.origin.url`
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index b68e29cd5ea223272151e7a8b52d998832f47103..3e4a2b5fa8a3981f6362edc1dc61ae1616e257ef 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -137,6 +137,10 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
     }
   }
 
+  if (FLAGS_use_mkldnn) {
+    CHECK_EQ(FLAGS_trainer_count, 1) << "MKLDNN only need 1 trainer";
+  }
+
   if (testing) {
     LOG(INFO) << "trainer: in testing mode";
     if (config_->getOptConfig().use_sparse_remote_updater() ||
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index f01ad4142d4fe7c7f7d7aac60d967ea114b93e56..2739878b7f2936ea2da689da0b4caa780516ccc1 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -11,7 +11,6 @@ add_unittest_without_exec(test_Trainer
     test_Trainer.cpp)
 add_test(NAME test_Trainer
   COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/gen_proto_data.py &&
         ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
         ${CMAKE_CURRENT_BINARY_DIR}/test_Trainer
     WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
@@ -28,35 +27,7 @@ if(WITH_PYTHON)
           ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
       WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 endif()
-################ test_CompareTwoNets ######################
-add_unittest_without_exec(test_CompareTwoNets
-    test_CompareTwoNets.cpp)
-add_test(NAME test_CompareTwoNets
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
-            --config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 
-############### test_CompareTwoOpts ###################
-add_unittest_without_exec(test_CompareTwoOpts
-    test_CompareTwoOpts.cpp)
-add_test(NAME test_CompareTwoOpts
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoOpts
-            --config_file_a=trainer/tests/sample_trainer_config_opt_a.conf --config_file_b=trainer/tests/sample_trainer_config_opt_b.conf
-            --num_passes=1 --need_high_accuracy=0
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-
-################# test_CompareSparse ##################
-add_unittest_without_exec(test_CompareSparse
-    test_CompareSparse.cpp)
-if(NOT ON_TRAVIS)
-  add_test(NAME test_CompareSparse
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-              ./.set_port.sh -p port -n 6
-                  ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-endif()
 ################# test_recurrent_machine_generation ###############
 add_unittest_without_exec(test_recurrent_machine_generation
     test_recurrent_machine_generation.cpp)
diff --git a/paddle/trainer/tests/chunking.conf b/paddle/trainer/tests/chunking.conf
deleted file mode 100644
index d88df919df8fee9209336ffa29d724dabe6af31b..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/chunking.conf
+++ /dev/null
@@ -1,125 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-TrainData(ProtoData(
-  files = 'trainer/tests/train_files.txt',
-  usage_ratio = 1.0,
-))
-
-TestData(ProtoData(
-  files = 'trainer/tests/test_files.txt'
-))
-
-default_initial_std(1)
-default_decay_rate(4e-4)
-default_device(0)
-
-Inputs("features", "word", "pos", "chunk")
-
-Outputs("crf")
-
-Layer(
-    name = "features",
-    type = "data",
-    size = 4339,
-)
-
-Layer(
-    name = "word",
-    type = "data",
-    size = 478,
-)
-
-Layer(
-    name = "pos",
-    type = "data",
-    size = 45
-)
-
-Layer(
-    name = "chunk",
-    type = "data",
-    size = 23
-)
-
-Layer(
-    name = "output",
-    type = "mixed",
-    size = 23,
-    bias = False,
-    device = -1,
-    inputs = [
-        FullMatrixProjection("features", parameter_name="feature_weights"),
-    #    TableProjection("word"),
-    #    TableProjection("pos"),
-    ],
-)
-
-Layer(
-    name = "crf",
-    type = "crf",
-    size = 23,
-    device = -1,
-    inputs = [
-        Input("output", parameter_name="crfw"),
-        "chunk"
-    ]
-)
-
-Layer(
-    name = "crf_decoding",
-    type = "crf_decoding",
-    size = 23,
-    device = -1,
-    inputs = [
-        Input("output", parameter_name="crfw"),
-        "chunk"
-    ]
-)
-
-Evaluator(
-    name = "error",
-    type = "sum",
-    inputs = "crf_decoding",
-)
-
-'''
-# chuck evaluator cannot be used for GPU training
-Evaluator(
-    name = "chunk_f1",
-    type = "chunk",
-    inputs = ["crf_decoding", "chunk"],
-    chunk_scheme = "IOB",
-    num_chunk_types = 11,
-)
-'''
-
-Settings(
-    algorithm = 'sgd',
-    batch_size = 100,
-    average_window = 0.5,
-    max_average_window = 2500,
-    learning_rate = 1e-1,
-    learning_rate_decay_a = 5e-7,
-    learning_rate_decay_b = 0.75,
-    l1weight = 0,
-    l2weight = 1,
-    c1 = 0.0001,
-    backoff = 0.5,
-    owlqn_steps = 100,
-    max_backoff = 5,
-)
diff --git a/paddle/trainer/tests/compare_sparse_data b/paddle/trainer/tests/compare_sparse_data
deleted file mode 100644
index 18fc6541383d8e8e1687b8fe1abd57aece3d4cfc..0000000000000000000000000000000000000000
Binary files a/paddle/trainer/tests/compare_sparse_data and /dev/null differ
diff --git a/paddle/trainer/tests/data_bin_part b/paddle/trainer/tests/data_bin_part
deleted file mode 100644
index 66ede391b0cffe6bc9611d3616b7b626864f5c3e..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/data_bin_part
+++ /dev/null
@@ -1,214 +0,0 @@
-F
-��X
-��X
-��X
-��X
-��X
-��X
-��X
-��X
-���H��C��=��T��F��T��Iַ;��H��=��T��F��T��IYW��.��8��T˔I͚4��8��T��N��8��T��E��9��8��T��W��8��T��&��6ͅT�T��H��C��=��T��F��T��Iַ;><��.��8˔I͚4��8��+��E��9��8��W��8��&��6��8��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I86��H��C��=��T��F��T��Iַ;��W��8��T��;��8��T��J��J��8��T&$��H��=��T��F��T��I��W��8Ю+��J��J��8���H��C��=��T��F��T��Iַ;��H��=��T��F��T��I ��H��C��=��T��F��T��Iַ;��@��?��H��=��T��F��T��I��@��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I86��8��T��8��T��&�9��C��6��H��C��=��T��F��T��Iַ;��B��T&$��8��8��&Ӗ5��H��=��T��F��T��I��B��T���H��C��=��T��F��T��Iַ;��H��=��T��F��T��IVT��H��C��=��T��F��T��Iַ;��8��T��8��TͅT�T��8��T��&�8��6�;��8��T��@�N��8��T��8��T;9��H��=��T��F��T��I��8��8��8��8��&�8��6�;��8��@�N��8��8��H��C��=��T��F��T��Iַ;��H��=��T��F��T��IMK��H��C��=��T��F��T��Iַ;ٟ@��1��7ȣ8��Gȣ8�/��>��7��;��B��A��U��Q��U��T��0A?��H��=��T��F��T��Iٟ@��1��7��G�/��>��7��;��B��A��U��Q��U��T��0���H��C��=��T��F��T��Iַ;��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I����.��8��T˔I͚4��8��T��N��8��T��E��9��8��T��W��8��T��&��6ͅT�T��H��C��=��T��F��T��Iַ;����'���J��A��-��E�J��@��8��T��-��Eބ2�4��8��TYW��.��8˔I͚4��8��+��E��9��8��W��8��&��6��8��H��=��T��F��T��I����A��M��1��8��Mބ2�4��8���H��C��=��T��F��T��Iַ;��H��=��T��F��T��IYW��.��8��T˔I͚4��8��T��N��8��T��E��9��8��T��W��8��T��&��6ͅT�T��H��C��=��T��F��T��Iַ;><��.��8˔I͚4��8��+��E��9��8��W��8��&��6��8��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I ��H��C��=��T��F��T��Iַ;��@��K��H��=��T��F��T��I��@��K���H��C��=��T��F��T��Iַ;��H��=��T��F��T��I ��H��C��=��T��F��T��Iַ;��@��?��H��=��T��F��T��I��@��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I#!��1��4��UƕT��6��.��Q��8��T��@Ԛ<��1��4ƕT��6��.��Q��8��@Ԛ<���H��C��=��T��F��T��Iַ;��H��=��T��F��T��IVT��H��C��=��T��F��T��Iַ;��8��T��8��TͅT�T��8��T��&�8��6�;��8��T��@�N��8��T��8��T;9��H��=��T��F��T��I��8��8��8��8��&�8��6�;��8��@�N��8��8��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;ܥ6��H��=��T��F��T��Iܥ6���H��C��=��T��F��T��Iַ;��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I;9��H��C��=��T��F��T��Iַ;��Q��;��B�� �������������!��H��=��T��F��T��I��Q��B���H��C��=��T��F��T��Iַ;��H��=��T��F��T��IYW��.��8��T˔I͚4��8��T��N��8��T��E��9��8��T��W��8��T��&��6ͅT�T��H��C��=��T��F��T��Iַ;><��.��8˔I͚4��8��+��E��9��8��W��8��&��6��8��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I53��H��W��8��T��;��8��T��8��T��H��C��=��T��F��T��Iַ;#!��H��W��8Ю+��8��H��=��T��F��T��I���H��C��=��T��F��T��Iַ;��H��=��T��F��T��I ��H��C��=��T��F��T��Iַ;��@��?��H��=��T��F��T��I��@��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I&$��H��C��=��T��F��T��Iַ;��V��G��D��; ��H��=��T��F��T��I��V��G��D��;�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G����G͡S�<��%����&б��̣ ��Fۧ1��1ņAǧ1ņAņA�<��6ҥ3߫U��V�K��T��V��U��6��>��V��M��U��F��>��M��5��%��������������̋'wu��G͡S�<��%������̣ ��Fۧ1��1ņAǧ1ņAņA�<��6��U��V�K��T��V��6��>��V��M��U��F��>ʶM��%��������������̋'�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G̣ ��'��@��@��@	���@��@�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G&$��O��4��=ӪN��/��>��K��/��;��8�,��T ��O��4��=ӪN��/��>��K��;��,��T�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G><��,��9��O��8��.̣ ������T��B����0��O��!��.�/��W��D��S��W53��,��9��O��8��.��T��B����0��O��!��.�/��W��D��S��W�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G��:��=��X̣ ��Q��U��T��G܂=��X̣ ��Q��T��G�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G)'��=������	��0̣ ��M��6ͅT��O��,��@Ԛ<#!��=ؐ��0̣ ��M��6ͅT��O��,��@Ԛ<�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G/-��=������	��0̣ ��M��6ͅT��O��,��D��S�D��A)'��=ؐ��0̣ ��M��6ͅT��O��,��D��S�D��A�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G	̣ Ҧ)��G��G���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ20��4��A�Q��.ŞGщQ��H��A��V��T��J��D��8��D��A��P&$��4��A�Q.щQ��H��A��V��T��D��8��A���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ&$��R��4�Q��>��.ŞG��GщQ��6��?��@Ԛ<#!��R��4�Q��>.��GщQ��6��?��@Ԛ<���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ&$��4�Q��.ŞG��J�I��GщQ��D��S�D��A#!��4�Q.��J�I��GщQ��D��S�D��A���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ&$��.ŞGٟ@��6��G��5�I��GщQ��A�7��B.ٟ@��6��G��5�I��GщQ��+���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ53��4�Q��>��.ŞG��D��A��P��;��0��T��?��6��T��)����! ��4�Q��>.��A��;��T��6��T��)���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ53��4�Q��>��.ŞG��D��A��P��;��0��T��?��6��T��)����! ��4�Q��>.��A��;��T��6��T��)���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ><��4��9��K�Q��.ŞG��R��G��D��9��H�O��K�J��A��.ŞG��=�R��J/-��4�-�Q.��R��G��D��9��H��K�J��A.�R��J���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ53��4��A��I�Q��.ŞGщQ��H��A��V��T��J��D��8��D��A��P)'��4��A��I�Q.щQ��H��A��V��T��D��8��A���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ/-��4��=�R��4�Q��>��A��E��.ŞG��C��/��W��9��9 ��4�R��4�Q��>����C��/��W��9���C��P��H��5��C��P��H��5;9��H��9��1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ86��H���1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ��C��P��H��5��C��P��H��5��U��P��H���>��G��@Ԛ<��U��P��H���>��G��@Ԛ<���C��P��H��5��C��P��H��5&$��C��P��H��A���>��G��D��S��PԮK߀3#!��C��P��H��A���>��G��D��S��PٮK��C��P��H��5��C��P��H��5��A��H���A��C��P��G��@Ԛ<��A��H���A��C��P��@Ԛ<���C��P��H��5��C��P��H��5;9��H��9��1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ86��H���1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ��C��P��H��5��C��P��H��5MK��H���FșK��>��7��Q��K�H��.��C��Q��R��>J��M��B��>��W��M��L��G��,��@Ԛ<MK��H���FșK��>��7��Q��K�H��.��C��Q��R��>J��M��B��>��W��M��L��G��,��@Ԛ<���C��P��H��5��C��P��H��5&$��C��P��H��A���>��G��D��S��PԮK߀3#!��C��P��H��A���>��G��D��S��PٮK��C��P��H��5��C��P��H��553��A��H��M��D��P�5��8��Qٟ@�H��3��/��A��@��@��@/-��A��H��M��D��P��8��Qٟ@�H��3��/��A��@��@���C��P��H��5��C��P��H��5;9��H��9��1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ86��H���1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ��C��P��H��5��C��P��H��5#!��A��H���A��C��P��G��D��S�D��A ��A��H���A��C��P��D��S�D��A���C��P��H��5��C��P��H��5&$��C��P��H��A���>��G��D��S��PԮK߀3#!��C��P��H��A���>��G��D��S��PٮK��C��P��H��5��C��P��H��5YW��I��=��=�R��>��H���/��/��G�M��>ϪJ�R��K��2��2��U׵A��H��T��U��A��6�����)��ʪYW��I��=��=�R��>��H���/��/��G�M��>ϪJ�R��K��2��2��U׵A��H��T��U��A��6�����)��ʪ���C��P��H��5��C��P��H��5;9��H��9��1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ86��H���1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ��C��P��H��5��C��P��H��5 ��6��P��H����>�5��H��O��A��B ��6��P��H����>�5��H��O��A��B���C��P��H��5��C��P��H��5&$��C��P��H��A���>��G��D��S��PԮK߀3#!��C��P��H��A���>��G��D��S��PٮK��C��P��H��5��C��P��H��5��H���G��2��2��A��@��@��@��H���G��2��2��A��@��@�����B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O20����N߹-��7��B�O��1ַ;��L߹-��N��A��7��O��Iַ;)'����N߹-��7��B�O��1��;߹-��N��A��7��I�����B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O,*����N߹-��B�O߹-��7�O߹-ַ;�OʈF��<��4)'����N߹-��B�O߹-��7߹-ַ;�OʈF��<��4�����B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O&$��A����N߹-��B�O��>��8ֽHٟ@��@Ԛ<#!��A����N߹-��B�O��>��8ٟ@��@Ԛ<�����B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O/-�
-������N߹-��C��7��F��B�O��R��1��:��?��T)'�
-������Nں-��7��B�O��R��1��:��?��T�����B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O��߹-��7�O߹-��B��T��߹-��7߹-��B�����B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O/-����N߹-��B�O��7��F�O��O��?��L߹-��OǧB��T)'����N߹-��B�O��7�O��O��?��L߹-��O��T�����B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O><߹-����N��L��B��7��F�O��QӮD��D�A��4��0�A��T����(����",*߹-����N��L���O��QӮD��D��A��0�A��T�������B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O,*��߹-��7��B�OİU��1��>��C��B��B�U��Q��4,*��߹-��7��B�OİU��1��>��C��B��B�U��Q��4�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��/><�
-������/��@ʡH��9��H��1��R��L��A¶7��/��J��D��O��8�,��T#!����N��9��1��L��N��/��J��D��,��T�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��/b`��1��R��L��D��A¶7��/�
-������J��0��E��K��B��8�/��/��O��E��Kю2��E��,��/��W��T�����)��ʪDB��1��L��D��N��/����J��0��K��B��8�/��O��Eю2��E�������)��ʪ�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��/20�
-������1��R��L��A¶7��/��J��0��E��O����@��K&$����1��L��N��/��J��0��E��O����@��K�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��/��>��T��7�O��=��P��;��>��7��=��P�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��//-��D��A¶7��/��1��R��L��JʡH��W��W��T�%����! ��D��N��/��1��L��JʡH��W��WՄO�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��/><�
-������N��1��R��L��A¶7��C��H��2��3��1��R��L��A¶7��/��/&$����N��1��L��N޻/��2��3��1��L��N��/�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��//-����L��G��R��1��¶7��/��1��7�>��>��G��<��T)'����L��+��¶7��/��1��7�>��>��G��<��T�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��/JH��A¶7��/��C��1��R��L��H��7��/����N����=��,��:�:��8��4��S��Q��H�9��T86��N��/��C��1��L��+����N����=��,ў8��4��S��Q��H�9��T�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��/DB�
-������/��@ʡH��9��H��1��R��L��A¶7��/��J��D��O��E��J��<��B��B,*����N��9��1��L��N��/��J��D��E��J��<��B��B���T����N��?��8��/��K��T����N��?��8��KJH��P����T΂:��8��/��K����N��T΂:��8��/��C��T΂:��8��4��T΂:��8��/��;��653��P����T����K����N��T����C��Tނ:��4��T����;��6��T����N��?��8��/��K��T����N��?��8��K><����N��T΂:��8��/��C��T΂:��K��T΂:��W��J��T΂:ì,��U��W��J&$����N��T����C��T�:��Tژ<��TЂ:�����T����N��?��8��/��K��T����N��?��8��KJH��P����T΂:��8��/��K����N��T΂:��8��/��C��T΂:��8��4��T΂:��8��/��;��653��P����T����K����N��T����C��Tނ:��4��T����;��6��T����N��?��8��/��K��T����N��?��8��K\Z��E��@��T΂:��:��T����T΂:��8��/��6����T΂:��8��/��K����T��8��/��;����T΂:��/��8��E��@JH��E��@��T΂:��:��T����T����6����T����K����T��8��;����T΂:��/��E��@���T����N��?��8��/��K��T����N��?��8��KJH��P����T΂:��8��/��K����N��T΂:��8��/��C��T΂:��8��4��T΂:��8��/��;��653��P����T����K����N��T����C��Tނ:��4��T����;��6��T����N��?��8��/��K��T����N��?��8��K ����N��,΂:��8��/��K��4��?�I����N��,����K��4��?�I���T����N��?��8��/��K��T����N��?��8��KJH��P����T΂:��8��/��K����N��T΂:��8��/��C��T΂:��8��4��T΂:��8��/��;��653��P����T����K����N��T����C��Tނ:��4��T����;��6��T����N��?��8��/��K��T����N��?��8��K86��΂:��8��/΂:��8��/��6��H΂:��/��8��K΂:��8��/��C��T#!��������6��H΂:��/��K����C��T���T����N��?��8��/��K��T����N��?��8��KJH��P����T΂:��8��/��K����N��T΂:��8��/��C��T΂:��8��4��T΂:��8��/��;��653��P����T����K����N��T����C��Tނ:��4��T����;��6��T����N��?��8��/��K��T����N��?��8��K><����N��T΂:��8��/��C��T΂:��K��T΂:��W��J��T΂:ì,��U��W��J&$����N��T����C��T�:��Tژ<��TЂ:�����T����N��?��8��/��K��T����N��?��8��KJH��P����T΂:��8��/��K����N��T΂:��8��/��C��T΂:��8��4��T΂:��8��/��;��653��P����T����K����N��T����C��Tނ:��4��T����;��6��T����N��?��8��/��K��T����N��?��8��K����N΂:��8��/��K�K΂:����N����K�K΂:�)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ&$��#��%��9��T��@��A��6��W��D��P�D��A ��#��9��@��A��6��W��D��P�D��A)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ\Z��R��<��L��#��%��6��K��9��T��V��4��6��V��6��#����6��#��%��6��#����6��$����6��#�8���8GE��R��<��L��#��6��K��9��V��6��V��6��#��6��#��6��#��6����6��#�8���8�)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ&$��#��%��9��T��@��A��6��W��D��P�D��A ��#��9��@��A��6��W��D��P�D��A)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ;9��>�R��>��%��B��>ڜ>��A��9��T��K�9�1��A��#��%��@��@��@20��>�R��>��%��B��>ڜ>��A��9��K�9�1��A��#��@��@�)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ&$��#��%��9��T��@��A��6��W��D��P�D��A ��#��9��@��A��6��W��D��P�D��A)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ#!��#��%��9��T��Kڜ>��B��E�I��U��T��#��9��Kڜ>��B��E�I��U�)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ&$��#��%��9��T��@��A��6��W��D��P�D��A ��#��9��@��A��6��W��D��P�D��A)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ#!��#��%��K��9��T��D��0��6�O��@Ԛ<��#��K��9��D��0��6��@Ԛ<�)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ&$��#��%��9��T��@��A��6��W��D��P�D��A ��#��9��@��A��6��W��D��P�D��A)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ��#��%��9��T��CۚK��@Ԛ<��#��9��CۚK��@Ԛ<�)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ&$��#��%��9��T��@��A��6��W��D��P�D��A ��#��9��@��A��6��W��D��P�D��A)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQGE��6��W��#��%��>��9��T�?��#��%��6��O�/�O��O�/��U��!�'��B�8��>ڜ>;9��6��W��#��>��9�?��#��6��O�/��O�/��U��!�'��B�8��>ڜ>�)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ&$��#��%��9��T��@��A��6��W��D��P�D��A ��#��9��@��A��6��W��D��P�D��A)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQYW��#��%��9��T��>��K��-��A��9��6��T��W��B��:��O��S��R��Q��9��#����%ѾC��H��T��L��6��L��TJH��#��9��>��K��A��9��6��T��W��B��:��O��S��R��Q��9��#����%��5��L��6��L��T�,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��, ؓ���=��BܤK��S��/��C��8��Tœ�=��BܤK��S��8��T,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,DB��G��D��G��>��W��-��3��M�8��F�=��Bٟ@��6��S��9ܤK��ȟN��	��U��686��G��D��G��>��W��3�8��F�=��B��5��S��9ܤK��ȟN��	��U�,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,><��E��R�=��B��6�O��7��>��T��H��H�8��@��9��F��F��S��A��@Ԛ<53��E��R�=��B��6��7��>��H��H�8��@��9��F��F��A��@Ԛ<,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,PN��8��4��C�8�1�=��B��R��V��T��6��C��A��E��/��:��6�L��U��U��NԛL��@��;��6��GDB��8��C�8�1�=��B��R��V��T��C��A��E��:��6�L��U��U��NԛL��@��6��G�,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,JH��H�=��B��/��-�8��>ܤK��D��A��9��=��S˱U�8��Q��TָU��J�����)��ʪDB��H�=��B��/�8��>ܤK��D��A��9��=��S˱U�8��Q��T�U�����)��ʪ,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,GE��A��B��R��B��E�9��A��6��BϜ>�8�=��B��6ץR��R��D��O��6ө����ۆ	;9��A��B��R��B��E�9��A��6��BϜ>�8�=��B��6ץR����6ө��,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,;9��R��Q��S��A��E��M�8�=��B��>ץR��9��)��N��U��6��!��G��J53��R��Q��S��A��E��C�=��B��>ץR��9��)��N��U��6��!��1,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,/-��V��J��V�1�8�=��B��R��6��?��#��%��@��@��@)'��V��J��V�1�8�=��B��R��6��?��#��@��@�,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,����D��>��EȊ5��6��R��T���8��J��F�=��B��K��T��:�8��J�=��B��R��F��K��,��3��4��D��H��@��CӽDҾW��K��?��>��S��@��9�9��I��S��D��P�D��Azx��D��>��Eˊ5��R��T���8��S�=��BАT��:�8��J�=��B��R��F��,��3��4��D��H��@��CӽDҾW��K��?��>��S��@��9��I��S��D��P�D��A,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,����V��6�8��B��Xʉ5�=��B��>ܤK��%��������&��Ξ)ʉ5��V����T��V��E����X��G��V����X��G��V���8��G������&��Ξ)��V��E����B��V��Ɣ>��X��V����U��8P��=ۚK��C��>��J��U̟K��O��4��>��L����V��6�8��B��X�=��B��>ܤK��%��������&��Ξ)ʉ5��V����T��V��E����X��V����X��V���8��G������&��Ξ)��V��E����B��V��۔>��V����U��8��=��C��J�.��4��>���H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��X����D��C��G��R��@��N����D��C��G��@��N���H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��X#!����6��C��G��Dʉ5��>��R������#!����6��C��G��Dʉ5��>��R���������H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��X86������G��R�>��R��P��>��R��6��9�9��V��A��D��S�D��A20����G��R�>��R��P��>��R��6��9��V��A��D��S�D��A���H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��X#!����D��R߻W�9��9������@��@��@����D��R߻W�9��9����@��@���H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��X����U��V�1�;��2��X��4����U��V�1�;��2��X���H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��Xnl��>��A����6߻W��$��6��X��T��6�/ҥ3��)��T��:��6��X��-��6��M��E��@��E��U��%�������!�����)�������!MK��>��A����6߻W��$��6XɺRҥ3��?��:��6��X��-��6��E��@��E�������)����P���H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��XA?������6��=��C߻W��E��D��>�3��K֟MȬT��T��(����#����$����!,*����6��=��C߻W��E��D��>�3��K֟MȬT��T�����H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��X;9����6��6��GȂ3ʉ5��>��R��>��B��������C��T��6��;��3��D53����6��6��GȂ3ʉ5��>��R��>��B������C��T��;��3��D���H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��X����D��C߻W��R��1��@��K����D��C�W��1��@��K�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6GE��6��/��K�Oٟ@P��=��>��8��E�9��R�B��H��A��V��T��J��D��8��D��A��P53��6��/�Oٟ@��=��>��8��E��R�B��H��A��V��T��D��8��A�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6&$��C��K�O�I�9��R�B��2��S��C��I��9��C�O�I��R�B��2��S��C��9�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6)'��L��P��K�O�9��R�B����6��P����6��T ��L��P�O��R�B����6��P����6�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6PN��6��K�O�9��R�B��E��I��T��6��>��S��K��?��K��IP��=��>��K��I��9��0��C��9��T><��6�O��R�B��E��I��T��6��>��S��K��?��K��=��>��K��9��0��C��T�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6,*кB��P��K�O��K��=��9��F�9��R��H��G��8��T#!кB��P�O��K��=��9��F��R��H��G��8�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6JH��H��K�O��>��6��/P��=�9��R��H��>��D��A��P��;��0��T��?��6��T��)����!/-��H�O��>��6��/��=��R��H��>��A��;��T��6��T��)�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6MK��K�O��6��/P��=��K�O��R��D�B��6�O��K��K�O��6�9��6��K�O��6щQ��@Ԛ<53�O��6��/��=�O��R��D�B��6��K�O��9�O��6щQ��@Ԛ<�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6,*��6��/��K�O�9��R�B��DǬ<��C��I��I�?��9 ��6��/�O��R�B��DǬ<��Cڗ?��9�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6><��A��2��I��P��K�O��K��G��C��=�9��R�B��D�9��6��D��P�D��A20��A��2��I��P�O��K��G��C��R�B��D��6��D��P�D��A���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=SQ��$����(����0�?�=��6��C�9��A��T��0��7��R��L��@��;��I��6��U��L��I��G��8��2��TMK��$������0�?�=��6��C�9��A��T��0��7��R��L��@��I��6��U��L��I��G��8��2��T���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=86��C�9��A��T��0�?�9��-�8ٟ@��6ǽ=��E��X��Eŧ;��>��P/-��C�9��A��T��0�?�9��-�8ٟ@��6��E��E��>��P���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=20ʻ?��<��C�9��A��T��0��6��AщQ��J��@��X��@��@��@/-ʻ?��<��C�9��A��T��0��6��AщQ��J��@��X��@��@���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��0�?��<��6��C�9��A��T��A�7��B��0�?��<��6��C�9��A��T��+���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=JH��D��>��0�?��9��<�9�=��C�9��A��T��V��B��$�����/��?��B��RÙK��B��TA?��D��>��0�?��9��<�9�=��C�9��A��T��V��B��$�����/��B��E��B���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=��0�?��<��C�9��A��T��0�?��<��C�9��A��T���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=PN�9��T��0�?��5��5�=��-щQ��X��S��C��A��E��/��:��6�L��U��U��NԛL��@��;��6��GGE�9��T��0�?��5��5�=��-щQƇX��C��A��E��:��6�L��U��U��NԛL��@��6��G���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=,*��0�?��<��6�9�=��C�9��A��T��D��S�D��A,*��0�?��<��6�9�=��C�9��A��T��D��S�D��A���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=)'��0�?�9��<��C�9��A��T��6�O��P��B��6#!��0�?�9��<��C�9��A��T��6��P��B���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>��B��7��Uח>��D��6��@Ԛ<��B��7��Uח>��D��6��@Ԛ<���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>��Uח>��@��K��Uח>��@��K���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>��B��7��Uח>ͦB��O��E��R��B��7��Uח>ͦB��O��E��R���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>��B��7��Uח>��8��;��BٖT��T��B��7��Uח>��8��;��B��T���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>86��A��H��F��S��=��@��=՞R��U��7��0ח>��G��D��S��PԮK߀320��A��H��F��=��@��=՞R��U��7��0ח>��G��D��S��PٮK���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח> ��B��7��Uח>��D��T����(����"��B��7��Uח>��D��T�����6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>��6��R��T��U��7��HˮD�D��A��6��T��U��7��HˮD�D��A���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>��B��7��Uח>��B��7��Uח>���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>20��A��H��F��S��=��@��=՞R��U��7��0ח>��G��P��B��6,*��A��H��F��=��@��=՞R��U��7��0ח>��G��P��B�	��<��X��	��<��	53��7��<��X��7��N��4��R�W������1��E������A��T��B ӱ��Nߩ7������E������A��B	��<��X��	��<��	MK��7��<��X��7��Q��L��4��4��R�W��5���������Q��-��<��>��;��������G��B;9ӱ��Q��L��4ߩ7��5���������Q��-��<��>��;��������G�	��<��X��	��<��	53��7��<��X��7��N��4��R�W������1��E������A��T��B ӱ��Nߩ7������E������A��B	��<��X��	��<��	��N��<��;��0��@��K��,��N��;��0��K��,�	��<��X��	��<��	53��7��<��X��7��N��4��R�W������1��E������A��T��B ӱ��Nߩ7������E������A��B	��<��X��	��<��	#!��<��X��4��R�W��>��4��8��@��@��@��<ߩ7��>��4��8��@��@�	��<��X��	��<��	53��7��<��X��7��N��4��R�W������1��E������A��T��B ӱ��Nߩ7������E������A��B	��<��X��	��<��	/-�4��P��P��PՈP��R��L��1�4��:����N��P��P��P&$�4��P��P��PՈP���:����N��P��P��P���3ϊX��Q��K	ԊX��Q��K,*��3ϊX��1��7��Q��7��G��/׆N��8����G��F̛<ԊX��1��3��G��/�N����G��F��3ϊX��Q��K	ԊX��Q��K/-��3ϊX��4��C��3��7��Q��7����3ϊX��@����@��@ ԊX��4��C��3��3��ԊX��@����@���3ϊX��Q��K	ԊX��Q��K,*��3ϊX��1��7��Q��7��G��/׆N��8����G��F̛<ԊX��1��3��G��/�N����G��F��3ϊX��Q��K	ԊX��Q��K ��RН?��3ϊX��Q׆N��S��?��4�8��RН?ԊX��Q��N��?��4�8���3ϊX��Q��K	ԊX��Q��K,*��3ϊX��1��7��Q��7��G��/׆N��8����G��F̛<ԊX��1��3��G��/�N����G��F��3ϊX��Q��K	ԊX��Q��K&$��Q��T��3ϊX��7��Q��7׆N��S��?��4�8��QԊX��3��N��?��4�8���3ϊX��Q��K	ԊX��Q��K,*��3ϊX��1��7��Q��7��G��/׆N��8����G��F̛<ԊX��1��3��G��/�N����G��F��3ϊX��Q��K	ԊX��Q��K/-��I��7��Q����F��7��3ϊX��>��F�9�Q��?��WɤK��IԊX��>��F�9�Q��?��WɤK���3ϊX��Q��K	ԊX��Q��K,*��3ϊX��1��7��Q��7��G��/׆N��8����G��F̛<ԊX��1��3��G��/�N����G��F��3ϊX��Q��K	ԊX��Q��K&$��R��3ϊX��4��6߻W��L��Q��G��8��@Ԛ< ��RԊX��4��6߻W��L��Q��G��@Ԛ<���3ϊX��Q��K	ԊX��Q��K,*��3ϊX��1��7��Q��7��G��/׆N��8����G��F̛<ԊX��1��3��G��/�N����G��F��3ϊX��Q��K	ԊX��Q��K><��3ϊX��R��7��Q��7��@��4��7��5�@����:ȥ����B��@����A��T/-ԊX��R��3��@��4��7��5�@����:ȥ����B����A���3ϊX��Q��K	ԊX��Q��K,*��3ϊX��1��7��Q��7��G��/׆N��8����G��F̛<ԊX��1��3��G��/�N����G��F��3ϊX��Q��K	ԊX��Q��K����"����!��F��>��"��F���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��720��D��N��/��E��L��>�7��Aڶ>��F��7��C��Dƹ;��@Ԛ<,*��D��NȜM��L��>�7��Aڶ>��F��7��C��4��@Ԛ<���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7)'��H��N��/��K��N��/ڶ>��F��7��=��A�7��B#!��H��N��/��K��N��/ڶ>��F��7��=��+���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7 ��H��/��6��7P��=��D��G��@��K��H��/��6��7��=��D��@��K���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7ܤK��K��A�7��B	ܤK��K��+���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7�����7��>��1��T֛7ٟ@��9��F��6��U��>ʔ7��1��/��>ٟ@��6��L��D��7��>��/��I��/��>��=щQ��D��D��H��I��N��.��/��5�9Ԛ<ڶ>��S��-��=��D��N��@��U��W��=��-щQܭD�H��T��D��S��=��D��S�D��A�����7��>��1֛7ٟ@��9��F��6��U��>ʔ7��1��/��>��5��L��D��>��/��I��/��>��=щQ��D��D��H��I��N��.��/��5��1��S��-��=��D��N��@��U��W��=��-щQܭD�H��T��D��S��=��D��S�D��A���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��=�8��T��=��4ڶ>��F��7��S��@��@��@)'��D��N��=�8��T��=��4ڶ>��F��7��S��@��@���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7 ��H��/��6��7��Dƹ;��D��G��@��K��H��/��6��7��4��D��@��K���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7><��D��1ƹ;��T��Dƹ;��>��S��=��>��7ʗ7��4��=��>��S��B��7��S��T86��D��1ƹ;��T��4��>��S��=��>��7ʗ7��4��>��S��B��7��S��T���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7��V��/��6��7��=��D��G��@��K��V��/��6��7��=��D��@��K���$��5��H��1��4�9��A��$��5��H��1��4�9��Aec������$��/��4��U��R��5��R��H������$��>��#��=��1��,��1��>��Bٟ@��T�9��A��L��Kٟ@��6��J�=��@Ԛ<\Z������$��/��4��U��5��R��H������$��>��#��=��1��,��1��>��@��T�9��A��L��Kٟ@��6�=��@Ԛ<��$��5��H��1��4�9��A��$��5��H��1��4�9��A><��E��R�9��U��5��1����$��L��2��;��N��@��6�1��O��D��S�D��A86��E��R�9��U��5��1����L��2��N��@��6�1��O��D��S�D��A���$��5��H��1��4�9��A��$��5��H��1��4�9��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A��$��5��H��1��4�9��A��$��5��H��1��4�9��ASQ��$��U��S�/��1��6��1��.��T�9��A��6��A��PɺD��E��X��>��EѾC��T��8��6��V��O��T��BA?��$��U��S�/��6��T�9��A��6��A��PɺD��E��X��E��T��8��V��O��T��B���$��5��H��1��4�9��A��$��5��H��1��4�9��Aec������$��/��4��U��R��5��R��H������$��>��#��=��1��,��1��>��Bٟ@��T�9��A��L��Kٟ@��6��J�=��@Ԛ<\Z������$��/��4��U��5��R��H������$��>��#��=��1��,��1��>��@��T�9��A��L��Kٟ@��6�=��@Ԛ<��$��5��H��1��4�9��A��$��5��H��1��4�9��A53��E��R�9�1��@��5��H��1��Bٟ@��4�9��A��E��@��@��@/-��E��R�9�1��@��5��H��1��@��4�9��A��E��@��@���$��5��H��1��4�9��A��$��5��H��1��4�9��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A��$��5��H��1��4�9��A��$��5��H��1��4�9��A,*��$�9���6��5��6��5��4�9�Q��5؂=��@Ԛ<,*��$�9���6��5��6��5��4�9�Q��5؂=��@Ԛ<���$��5��H��1��4�9��A��$��5��H��1��4�9��Aec������$��/��4��U��R��5��R��H������$��>��#��=��1��,��1��>��Bٟ@��T�9��A��L��Kٟ@��6��J�=��@Ԛ<\Z������$��/��4��U��5��R��H������$��>��#��=��1��,��1��>��@��T�9��A��L��Kٟ@��6�=��@Ԛ<��$��5��H��1��4�9��A��$��5��H��1��4�9��ADB��"��Ξ)����6�9��$��R��5��4�9��A��Iٟ@��T��N��>��C��J��@��@Ԛ<><��"������6�9��$��R��5��4�9��A��@��T��N��>��C��J��@��@Ԛ<���$��5��H��1��4�9��A��$��5��H��1��4�9��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A��$��5��H��1��4�9��A��$��5��H��1��4�9��A;9��E��4��W��N��$��R��B��5��H��4��L��D��L��IĪN��C��S��@��K;9��E��4��W��N��$��R��B��5��H��4��L��D��L��IĪN��C��S��@��K���$��5��H��1��4�9��A��$��5��H��1��4�9��Aec������$��/��4��U��R��5��R��H������$��>��#��=��1��,��1��>��Bٟ@��T�9��A��L��Kٟ@��6��J�=��@Ԛ<\Z������$��/��4��U��5��R��H������$��>��#��=��1��,��1��>��@��T�9��A��L��Kٟ@��6�=��@Ԛ<��$��5��H��1��4�9��A��$��5��H��1��4�9��A53��@��;��5��R��H����$��U��L��T�9��A��6��D��P�D��A/-��@��5��R��H����U��L��T�9��A��6��D��P�D��A���$��5��H��1��4�9��A��$��5��H��1��4�9��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A��$��5��H��1��4�9��A��$��5��H��1��4�9��A86��D��P��>��E��5��H��"����$ĪN��L��=��4�9��6��A�7��B/-�P��>��E��5��H��"����$ĪN��L��=��4�9��6��+���:��/��SʡH��9��9��S��H��:��S��9��HDB��S��W�J��9��?��9��?��:��/��SʡH��9��9��:��/��SʡH��9��9��5ܛ?��M)'��W�J��9��9��:��S��9��:��S��9��5ܛ?��M��:��/��SʡH��9��9��S��H��:��S��9��H/-��:��/��SʡH��9��9��9��?��9��9��?��D����6��T��:��S��9��9��9��9��D����6���:��/��SʡH��9��9��S��H��:��S��9��H&$��S��V��:��/��SʡH��9��9��S����6��T��V��:��S��9����6��:��/��SʡH��9��9��S��H��:��S��9��H#!��S��:��/��SʡH��9��9��9��?Έ;��F��:��S��9��9Έ;��F���:��/��SʡH��9��9��S��H��:��S��9��HDB��S��W�J��9��?��9��?��:��/��SʡH��9��9��:��/��SʡH��9��9��5ܛ?��M)'��W�J��9��9��:��S��9��:��S��9��5ܛ?��M��:��/��SʡH��9��9��S��H��:��S��9��H&$��S��V��:��/��SʡH��9��9��9��?��<��V��V��:��S��9��9��<��V���:��/��SʡH��9��9��S��H��:��S��9��H&$��S��V��:��/��SʡH��9��9��S����6��T��V��:��S��9����6��:��/��SʡH��9��9��S��H��:��S��9��H20��S��:��/��SʡH��9��9��9��?ёC؄/��H��;��0��D��T��:��S��9��9ґC��H��;��D��T���:��/��SʡH��9��9��S��H��:��S��9��HDB��S��W�J��9��?��9��?��:��/��SʡH��9��9��:��/��SʡH��9��9��5ܛ?��M)'��W�J��9��9��:��S��9��:��S��9��5ܛ?��M��:��/��SʡH��9��9��S��H��:��S��9��H)'��:��/��SʡH��9��9��S��:��?��B��6݆.��T��:��S��9��:��B݆.��T���:��/��SʡH��9��9��S��H��:��S��9��H&$��S��V��:��/��SʡH��9��9��S����6��T��V��:��S��9����6��:��/��SʡH��9��9��S��H��:��S��9��H#!��S��9��?��:��/��SʡH��9��9��?��T��9��:��S��9��?��T���:��/��SʡH��9��9��S��H��:��S��9��HDB��S��W�J��9��?��9��?��:��/��SʡH��9��9��:��/��SʡH��9��9��5ܛ?��M)'��W�J��9��9��:��S��9��:��S��9��5ܛ?��M��:��/��SʡH��9��9��S��H��:��S��9��H��B��<��V�?��-��=�R��J��B��<��V�?�R��J���:��/��SʡH��9��9��S��H��:��S��9��H&$��S��V��:��/��SʡH��9��9��S����6��T��V��:��S��9����6��:��/��SʡH��9��9��S��H��:��S��9��H#!��S��:��/��SʡH��9��9��9��?Έ;��F��:��S��9��9Έ;��F���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��Kwu��7��R��D��H��>��<��>��K���,��0��7������R��2��������
-�.��T������ʆ��L��@ϡS��4��,ܢE��M��,�.��O��2��J��6MK������R��2��������
-�.��T������ʆ��L��@ϡS��4��,��E��,�.��O��J��6���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��KMK��R��D��H��>��<��>��K�,��0��I��O��9��4��9��1یV��0P��=P��H��>�.�E��6A?��R��H��>��<��>��K�,��0��I��O��9��4��V��0��=P��H��>�.�E��6���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��K#!��R��D��>��H��<��K��1��Q��@��@��@��R��>��H��<��K��1��Q��@��@���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��K/-��D��H��>��K��=��<��,��D��6�R��=��4��,��@Ԛ<&$��D��5�4��,��D��6�R��=��4��,��@Ԛ<���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��K����7��R��D��H��>��<��>��K���2��>��7�.ʆ��J��6������ʆ��G������1��?������P��=��1��?����I��2����K��7����>��>����M��G����MߎM������6��>��J��Rʆ�.��J��6�~�.ʆ��J��6������ʆ��G������1��?��������=��1��?����I����K��7����>����M��G����MߎM������6��>��J��Rʆ�.��J��6���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��K20��R��D��H��>��<��>��K�2��>��J��6��/��;��I��N��9,*��R��H��>��<��>��K�2��>��J��6��/��;��N��9���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��K_]��R��D��H��>��<��>��K�2��>ʆ��>��I��2́���N��4��TȇN��4��T��I��(Pބ2��>�N��4ʆ�N��4GE��R��H��>��<��>��K�2��>ʆ��>��I��(�N��4ȇN��4��I��(܉2��>��Nʆ��N���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��KGE��R��D��H��>��<��>��K�2��>��J��>��I��2ˏR��3˰(��I��B��>P�3ˏR��2;9��R��H��>��<��>��K�2��>�J��IˏR��3˰(��I��B��>�3ˏR��2�ʰD��B��N��M��G��>��B��M��G��>JHɵO��9��F��D��S��C��4ʰD��B��N��5��>��3��5��-��=�9��O��2���:��@��@��@53ɵO��9��D��S��C��4��B��5��>��I��-��=��O��2�G��@��@ʰD��B��N��M��G��>��B��M��G��> ��L��N��L��BʰD��B��N��@��@��@��L��N��L��B��@��@�ʰD��B��N��M��G��>��B��M��G��>JHɵO��9��F��D��S��C��4ʰD��B��N��5��>��3��5��-��=�9��O��2���:��@��@��@53ɵO��9��D��S��C��4��B��5��>��I��-��=��O��2�G��@��@ʰD��B��N��M��G��>��B��M��G��>)'�$������"��;��0��Q��8ҐJ��9ҽ6��W��H)'�$������"��;��0��Q��8ҐJ��9ҽ6��W��H���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A ��4��A��R��=��J��	��D��G��@��K��4��A��R��=��	��D��@��K���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��/��4��A��R��Q��=��J��B��4��/��4��A��R��Q��=��B���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A><İF��E��1��;��T��V��L��8��A��R��O��8��L��0��A��WН?��/Н?��T,*İF��B��T��V��L��8��A��R��O��8��L��AН?��-���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A20İF��E��1��;��T��V��L��8��A��R��O��8��L��0��A��W&$İF��B��T��V��L��8��A��R��O��8��L��A���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��APNİF��E��1��;��T��V��L��8��A��R��O��8��L��0��A��W��WН?��W��?��U��U��W��TН?��>;9İF��B��T��V��L��8��A��R��O��8��L��A��W��W��?��U��U��W��?���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A,*İF��E��1��;��T��V��L��8��O��3߫UТ@��H��T&$İF��B��T��V��L��8��O��3߫UТ@��H��T���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��A��R��4��J��A��R��4��J���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��L��/��4��A��R��Q��>��L��/��4��A��R��Q��>���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A&$ŷ5��/��B��A��R��4��J��X��>��<��B��B#!ŷ5��/��B��A��R��4��X��>��<��B��B���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�9��H��H��9��X�RʹE��>��B��H��H��9��;��>��B���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9��C��R��9��Xǌ8��@Ԛ<��C��R��9ǌ8��@Ԛ<���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�9����9��X��U��T��I��9��X��N�S��;��U��O��Iַ;��U��R��I��I��I�K��I��H��B��O��F��;��F��;��<��U��O��U��M��U��B��I��O��U��-��4��I��9��P��;��P��-��7��;��U��R��I��4��;��Vnl��9��U��I��9��S��G��O��I��U��I��I��I��B��O��F��F��<��U��O��U��U��I��O��U��-��I��9��;��P��7��G��R��I��4��;��V���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9&$��9��X��9��C�5��I��9��1ӛ?��6��9�; ��9��F�5��I��9��1ӛ?��6��9�;���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�9��9��X��U��C��;��-��9��U��C��-���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9\Z��G��9��X��W�F��5ԎB��@��J��P��1��1�<��T��I��:��2��O��:��9��X��C��E��I��>��.��3��>��7��2PN��G��9��W�F��5�B��J��P��1��1�<��T��I��:��2��O��:��9��C��I��>��.��3��>��7��2���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�986��9��X��B�9ԎB��@��@��O��L��W�F�R��9��B��9��<��C��T/-��9��B�9�B��@��O��L��W�F�R��9��B��9��<��C���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9&$��9��X��9��X��5��9�Q��C��ͦ(����!��9��9��5ƋQ��C�����9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�9��9��X��@��?��9��@���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9YW��9��X��B�9ԎB��@��>��5��4��W�F�R��9��B��9�� ��I��C��7��0��FŔ6��A�D��M��Iַ;��7��0DB��9��B�9�B��>��5��4��W�F�R��9��B��9�� ��I�C��0��FŔ6��1��I��7���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�9DB��9��X��C��K��2�9�R��5��>��9��X��W��A��/��1��C��2��O��D�K��O��D53��9��C��2��R��5��>��9��W��A��1��C��2��O��D�K��O��D���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9&$��9��X��>��K��T��CΚI�RН?��>��A��T��9��>��K��CΚI�R��?��A���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�920��C��C��T��C��7��V��C��E��I��Cַ;��C��;��-��C��T��C��C�C��VĸIַ;��C��-��C���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9DB��H��W��:��9��X��B�9ԎB��@��=ŉE��D��W�F�R��9��B��9��9��X��C��T86��H��W��:��9��B�9�B��=ŉE��D��W�F�R��9��B��9��9��C���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�9��9��X��@��T��9��X��R��0ܥ6��9��@��T��9��Rܥ6���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9��G��7��;��C��T��G��7��;��C���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�9A?��Hʜ2��R��A��@�R��S�9��@��>��9��X��3��>��)��כ$��>��;��G��B;9��Hʜ2��R��A��@�R��S�9��@��>��9��3��>��)��כ$��>��;��G���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9><����N�R��=��9��X��C��9��S�9��9׵A��A��K��E��A��A��B��C��/;9����N�R��=��9��C��9��S�9��9׵A��A��K��E��A��A��B��C��/���=��@��K��E��=��=��@��E��=><��@��Q��0��H��@��KûA��Q��H��@��KûA��Q��,��H��P��H��C��B��020��@��0��H��@ûA��Q��H��@ûA��Q��,��H��P��H��B��0��=��@��K��E��=��=��@��E��=��=��@��J��@��K��I��5��@��=��@��J��@��I��5��@���=��@��K��E��=��=��@��E��=/-��7ûA��K��3��@��3��@��K��7��K��K��3������!#!��7ûA��K��3��@��3��@��7��K��3����=��@��K��E��=��=��@��E��=86��=��@��K��A��K��C��K��-��3��O��?��3��3��7��7����C��T)'��=��@��A��K��C��K��-��.��?�.��7����C���=��@��K��E��=��=��@��E��=/-��K��6��S�5��@��K��E��=��4��I��,��S��@��@��@)'��K��6��S�5��@��E��=��4��I��,��S��@��@��=��@��K��E��=��=��@��E��=��@��K��@��?��@��@���=��@��K��E��=��=��@��E��=)'��C��@ַ;��C��@��G��C��@��K��=��@��A��B&$��C��@ַ;��C��@��G��C��@��=��@��A��B��=��@��K��E��=��=��@��E��=DB��I��K��@��K��Q�Oַ;�O��E��6��V��=ԋ����J��>��J��T��7��L��J��653��I��K��@��Q�Oַ;�O��E��6��V��=�J��J��7��L��J��6�ԃP��;ܢE��4��J��Aˑ+86��Q���F��M�1��UܢE��4��NԃP��;��O��4��HН?��U��,��T#!��Q���F��M�1��U����A��O��H��,ԃP��;ܢE��4��J��Aˑ+,*ԃP��;��Q��8ȘI��K��5ܢE��4��N��>��4��O�J��A��Q��8��K��5����>��4�O�ԃP��;ܢE��4��J��Aˑ+ ԃP��;�1ܢE��4��NН?̛<��E��T��A�1����?��E��TԃP��;ܢE��4��J��Aˑ+53��Q�1��U��?��TܢE��4��N��=��D��.��4ԃP��;߽4��G��3 ��Q�1��U��?����D��4��A߽4��G�ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4׶K��2�1��T)'��A��H��Q��8��K��5����C��>��4׶K��2��1ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4б��X��Q��T)'��A��H��Q��8��K��5����C��>��4б��X��Q�ԃP��;ܢE��4��J��Aˑ+><ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��HН?Н?��>��H��T,*��A��H��Q��8��K��5����C��>��4��H��?��H��TԃP��;ܢE��4��J��Aˑ+MKԃP��;��H��Q��8ȘI��K��5��N��C��>��4ԃP��;��Q��:��3��3ȘI��J��8��2��T��H��A><��A��H��Q��8��K��5��N��C��>��4��A��Q��:��3��3ȘI��J��8��2��H�ԃP��;ܢE��4��J��Aˑ+53ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��TН?��T&$��A��H��Q��8��K��5����C��>��4��Tܞ?ԃP��;ܢE��4��J��Aˑ+/-ԃP��;��H��L��-��TܢE��4��N��C��4�1��TН?��> ��A��H��L��-��T����C��4�1��?�ԃP��;ܢE��4��J��Aˑ+��Dֈ;��0��O��F��ԃP��;ܢE��4��J��Aˑ+,*ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4 ��A��H��Q��8��K��5����C��>��4�ԃP��;ܢE��4��J��Aˑ+86��Q���F��M�1��UܢE��4��NԃP��;��O��4��HН?��U��,��T#!��Q���F��M�1��U����A��O��H��,ԃP��;ܢE��4��J��Aˑ+)'ԃP��;��E��7��2��TܢE��4��NŇ7̛<��U��T��A��E��7��T��Ň7̛<��U�ԃP��;ܢE��4��J��Aˑ+ ԃP��;�1ܢE��4��NН?̛<��E��T��A�1����?��E��TԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4б��3��Q��T)'��A��H��Q��8��K��5����C��>��4б��3��Q�ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4׶K��2�1��T)'��A��H��Q��8��K��5����C��>��4׶K��2��1ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��T��?��T��CܢE��0&$��A��H��Q��8��K��5����C��T��?��T��/�ԃP��;ܢE��4��J��Aˑ+><ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��HН?Н?��>��H��T,*��A��H��Q��8��K��5����C��>��4��H��?��H��TԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��XН?��2��J&$��A��H��Q��8��K��5����C��>��4��X��2�ԃP��;ܢE��4��J��Aˑ+53ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��TН?��T&$��A��H��Q��8��K��5����C��>��4��Tܞ?ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4Н?̛<��E��T)'��A��H��Q��8��K��5����C��>��4��?��E��T�ԃP��;ܢE��4��J��Aˑ+��Dֈ;��0��O��F��ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4Н?̛<��U��T&$��A��H��Q��8��K��5����C��>��4��?��U�ԃP��;ܢE��4��J��Aˑ+86��Q���F��M�1��UܢE��4��NԃP��;��O��4��HН?��U��,��T#!��Q���F��M�1��U����A��O��H��,ԃP��;ܢE��4��J��Aˑ+#!��Q�1��NÚQ��8ȘI��K��TԃP��;��4��Q�1��N��8��K��T��A��4�ԃP��;ܢE��4��J��Aˑ+ ԃP��;�1ܢE��4��NН?̛<��E��T��A�1����?��E��TԃP��;ܢE��4��J��Aˑ+53��-ԃP��;��HܢE��4��N��C��;�>��C��1��A��1��J��>��=)'��-��A��H����C��;�>��C��1��Aܹ1��>��=�ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4׶K��2�1��T)'��A��H��Q��8��K��5����C��>��4׶K��2��1ԃP��;ܢE��4��J��Aˑ+�~��4����N��U��.̤3��@��>ϥJ��=��T��.��-��0ܢE��4��N��5��H��0�1ԃP��;��R��:��?��=��N��.̤3��@��>��P��T��T��>��J��F��F��8��G��3b`��4����N��U��.��LϥJ��=��T��.��-��0����5��H��0�1��A��R��:��=��N��.��L��P��T��>��J��F��F��8��G�ԃP��;ܢE��4��J��Aˑ+><ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��HН?Н?��>��H��T,*��A��H��Q��8��K��5����C��>��4��H��?��H��TԃP��;ܢE��4��J��Aˑ+20ԃP��;߽4��Q��8ȘI��K��5ܢE��4��N��,��4��U��/��T&$��A߽4��Q��8��K��5����,��4��U��/��T�ԃP��;ܢE��4��J��Aˑ+53ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��TН?��T&$��A��H��Q��8��K��5����C��>��4��Tܞ?ԃP��;ܢE��4��J��Aˑ+,*��Q�1ʡH��9��B�X��TܢE��4��NН?̛<��7��T��Q�1��9�X��T����?��7�ԃP��;ܢE��4��J��Aˑ+��Dֈ;��0��O��F��ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4Н?��>��R��T&$��A��H��Q��8��K��5����C��>��4��?��R�ԃP��;ܢE��4��J��Aˑ+86��Q���F��M�1��UܢE��4��NԃP��;��O��4��HН?��U��,��T#!��Q���F��M�1��U����A��O��H��,ԃP��;ܢE��4��J��Aˑ+20ԃP��;߽4��Q��8ȘI��K��5ܢE��4��N��,��4��X��Q��T#!��A߽4��Q��8��K��5����,��4��X��Q�ԃP��;ܢE��4��J��Aˑ+ ԃP��;�1ܢE��4��NН?̛<��E��T��A�1����?��E��TԃP��;ܢE��4��J��Aˑ+MKԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��C��-��HН?̛<��&���#��#����!)'��A��H��Q��8��K��5����C��>��4��*��?���ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4׶K��2�1��T)'��A��H��Q��8��K��5����C��>��4׶K��2��1ԃP��;ܢE��4��J��Aˑ+SQԃP��;��H��QʡHɤU��B��U��H��MܢE��4��N��C��T۹/��8��H��M��T��>��J�T��8��:��G��3><��A��H��QʡHɤU��B��U��H����C��T۹/��8��H��T��>��J�T��:��G�ԃP��;ܢE��4��J��Aˑ+><ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��HН?Н?��>��H��T,*��A��H��Q��8��K��5����C��>��4��H��?��H��TԃP��;ܢE��4��J��Aˑ+ecԃP��;߽4��Q��8ȘI��K��5ܢE��4��N��,��4ԃP��;ܢE��4��N��5��NģC��F��4��Q��O�1��M��J��Eа.��TН?��>;9��A߽4��Q��8��K��5����,��4��A������Q��O��-��Eа.��T��?�ԃP��;ܢE��4��J��Aˑ+53ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��TН?��T&$��A��H��Q��8��K��5����C��>��4��Tܞ?ԃP��;ܢE��4��J��Aˑ+/-ԃP��;��H��UܢE��4��N��C��R��K��D��?��TيR̛<&$��A��H��U����C��R��K��D��?��TيR̛<�ԃP��;ܢE��4��J��Aˑ+��Dֈ;��0��O��F��ԃP��;ܢE��4��J��Aˑ+,*��7ԃP��;��E��7��2��TܢE��4��NН?��>��A��T��7��A��E��7��T����?��A�ԃP��;ܢE��4��J��Aˑ+86��Q���F��M�1��UܢE��4��NԃP��;��O��4��HН?��U��,��T#!��Q���F��M�1��U����A��O��H��,ԃP��;ܢE��4��J��Aˑ+86߹-�JН?̛<ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4&$��-��?��A��H��Q��8��K��5����C��>��4�ԃP��;ܢE��4��J��Aˑ+ ԃP��;�1ܢE��4��NН?̛<��E��T��A�1����?��E��TԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4б��X��Q��T)'��A��H��Q��8��K��5����C��>��4б��X��Q�ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4׶K��2�1��T)'��A��H��Q��8��K��5����C��>��4׶K��2��1ԃP��;ܢE��4��J��Aˑ+#!ԃP��;߽4��U��L��6��.��TܢE��4��N��A߽4��U��L��6��T���ԃP��;ܢE��4��J��Aˑ+><ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��HН?Н?��>��H��T,*��A��H��Q��8��K��5����C��>��4��H��?��H��TԃP��;ܢE��4��J��Aˑ+20ԃP��;߽4��U��7��2��TܢE��4��NԃP��;߽4��TН?��T ��A߽4��U��7��T����A߽4��Tܞ?�ԃP��;ܢE��4��J��Aˑ+53ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��TН?��T&$��A��H��Q��8��K��5����C��>��4��Tܞ?ԃP��;ܢE��4��J��Aˑ+;9ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4����0̛<��Q��T)'��A��H��Q��8��K��5����C��>��4����0��Q���T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��/-��T��I�O��Tބ2��B��J��7��6��8��T��7��P��4��J#!��T��I�O��Tބ2��BќJ��6��8��7��4���T��I�O��T����T��I�O��T��&$��T��I�O��T��0��Q��7��J��6��J����7&$��T��I�O��T��0��Q��7��J��6��J����7��T��I�O��T����T��I�O��T��JH��R��T��I�O��T��4�/����>��BԚ<��P��D��U���%����!��,��9��=��9��@Ԛ<;9��R��T��I�O��T��4�/����>Ԛ<��P��D��U��,��9��=��9��@Ԛ<���T��I�O��T����T��I�O��T�� ��R��T��I�O��T��>����U��@Ԛ< ��R��T��I�O��T��>����U��@Ԛ<��T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<���T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��#!��T��I�O��Tބ2��B��>��T��V��>��T#!��T��I�O��Tބ2��B��>��T��V��>��T���T��I�O��T����T��I�O��T��&$��T��I�O��T��0��Q��7��J��6��J����7&$��T��I�O��T��0��Q��7��J��6��J����7��T��I�O��T����T��I�O��T��SQ��R��P��4��D��3��T��MɾS��B��T��I�O��T��L��;��U��$����N��,�����%����!��@Ԛ<;9��R��4��D��3��T��M��B��T��I�O��T��L��;��U����N��,��@Ԛ<���T��I�O��T����T��I�O��T�� ��R��T��I�O��T��>����U��@Ԛ< ��R��T��I�O��T��>����U��@Ԛ<��T��I�O��T����T��I�O��T��86��R��9��T��I�O��T��>����B��K��1١-��J��L�;��@��@��@/-��R��9��T��I�O��T��>����B��K��1١-��8��@��@���T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��tr��T��>��I�O��Tմ2�O̤@��R�O��W��BǞV��<��>��MɾS��3��D����U��J��D��P��>��W��>��5ֈD��,��D��L��9��A��D��S�D��Aki��T��>��I�O��Tմ2��@��R�O��W��BȞV��>��M��3��D����U��J��D��P��>��W��>��5ֈD��,��D��L��9��A��D��S�D��A���T��I�O��T����T��I�O��T��&$��T��I�O��T��0��Q��7��J��6��J����7&$��T��I�O��T��0��Q��7��J��6��J����7��T��I�O��T����T��I�O��T��53��T��I�O��T��*��B��6��J��7��6��8��T��7��P��4��J��2)'��T��I�O��T��*��B��6ќJ��6��8��7��4��2���T��I�O��T����T��I�O��T�� ��R��T��I�O��T��>����U��@Ԛ< ��R��T��I�O��T��>����U��@Ԛ<��T��I�O��T����T��I�O��T��,*��T��I�O��T��>����,��:���%����!��@Ԛ< ��T��I�O��T��>����,��:��@Ԛ<���T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��A?��T��7��I�O��T��>����3��D��,��R��,��S��U��U��P��4��J��@��@��@53��T��7��I�O��T��>����3��D��,��R��,��S��U��4��@��@���T��I�O��T����T��I�O��T��&$��T��I�O��T��0��Q��7��J��6��J����7&$��T��I�O��T��0��Q��7��J��6��J����7��T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<���T��I�O��T����T��I�O��T�� ��R��T��I�O��T��>����U��@Ԛ< ��R��T��I�O��T��>����U��@Ԛ<��T��I�O��T����T��I�O��T��86��R��T��>��I�O��T��K��>��S��F��>����P��4��J��@��@��@)'��R��T��>��I�O��T��K��S��>����4��@��@���T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��/-��T��I�O��T����B��6��2��L��C��P��4��J��>��T#!��T��I�O��T����B��6��2��C��4��>���T��I�O��T����T��I�O��T��&$��T��I�O��T��0��Q��7��J��6��J����7&$��T��I�O��T��0��Q��7��J��6��J����7��T��I�O��T����T��I�O��T��MK��9��Q��D��T��7��I�O��T��>��S��F��D��U��>��F��>����;��/��?��B��RÙK��B��T><��9��Q��D��T��7��I�O��T��S��D��U��>��F��>����;��/��B��E��B���T��I�O��T����T��I�O��T�� ��R��T��I�O��T��>����U��@Ԛ< ��R��T��I�O��T��>����U��@Ԛ<��T��I�O��T����T��I�O��T��20��P��4��J��T��I�O��T��S��U��Xߢ?��U��,��6��X��T&$��4��T��I�O��T��S��U��X��?��6��X��T���T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��20��T��I�O��T��4��7��>��4����3��D��F��D��S�D��A,*��T��I�O��T��4��>����3��D��F��D��S�D��A���T��I�O��T����T��I�O��T��&$��T��I�O��T��0��Q��7��J��6��J����7&$��T��I�O��T��0��Q��7��J��6��J����7��T��I�O��T����T��I�O��T��20��T��I�O��T��B��6��J��7��6��8��T��7��P��4��B��T)'��T��I�O��T��B��6ќJ��6��8��7��4��B��T���T��I�O��T����T��I�O��T�� ��R��T��I�O��T��>����U��@Ԛ< ��R��T��I�O��T��>����U��@Ԛ<��T��I�O��T����T��I�O��T��hf��R��T��>��I�O��T��>����U��P��4��>�4P��=��A��N��,��:��L���%����!��*��*��P��4��>٬J��=��$��@Ԛ<SQ��R��T��>��I�O��T��>����U��4��>�4��=��A��N��,��:��L��*��*��4��>٬J��=��$��@Ԛ<���T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��DB��R��P��4��J��T��I�O��T�>��M�K��J��I�O��T��K��K����D��P�D��A><��R��4��T��I�O��T�>��M�K��J��I�O��T��K��K����D��P�D��A���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A��%����A��G	��%��A��A���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A������%����A��%��A������A��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M����&����'��������%��I��I��A��$��ۏ"��&����'��������%��I��I����A��G��&����'��������%��I��I����:��A��G��D�3��A��T��(����%����!����A��G}{��&����'��������%��I��I��A����&����'��������%��I��I����A��&����'��������%��I��I����:��A��D�3��A��T��V����A���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A��%����A������ ��%��A��A������ ���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A������%����A��%��A������A��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M20��%��C��V��2��%��0��J��%��2��C��W��F��T��O��W��W)'��%��C��V��2��%��0��%��2��W��F��T��O��9���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M20��%��B��F��%��J��W����D��G��%��A��G��@��F��:��=#!��%��<��%��J����D��%��A��@��:��=���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A������%����A��%��A������A��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��MJH��D��9��G��M��>��A��Qٟ@��D��B�U��,�G߇;�G��3��M��Vٟ@��6��D��P�D��A><��DٚG��>��A��Qٟ@��D��K��,�G߇;�G��3��M��V��5��D��P�D��A���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M/-����A����G��%��;̽>��MŹ��(Źʿ��@��@��@)'����A����G��%��;�>Ź��(Źʿ��@��@���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A������%����A��%��A������A��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M20��%��D��J��W��.��>��=��V��%��J��W����G��%����A)'��%��D��J��W��.��>��=��V��%��J����G��A��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��86��$����&���� ��C��2̙EϪJֈD��T�9��J��9��@����A��B/-������ ��C��2̙EϪJֈD��T��J��9��@����A��B��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��)'�� ��2��E��C��$����&��E̛<��0��>��W��T�� ��2��E��C����E��0��>��W��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��)'��$����&���� ��C��2��G��E��9ֈD��@Ԛ<#!������ ��C��2��G��E��9ֈD��@Ԛ<��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��;9��Sޡ8��$����&��>��&��2̙E�� ֈD��>ܤK��$��'��&��9�Q��')'��S����>��&��2̙E�� ֈD��>ܤK��ƋQ��'��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD���~��6��A��B��6��T�� ��2��EۈX��D��:ۈX��>ў7��&��B��$����&��,��&ίB��>��T��7��>��K��U��V��J�J��K��U��Q��T��I��1���R��/��0��Qec��6��A��B��6��T�� ��2��EۈX��D��:ۈX��>ў7��&��B����,��&ίB��>��T��7��K��V����Q��I��1���R��/��Q��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD����$����&��2��@�� ��8�,��T����2��@�� ��,��T��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��_]��$֗>��A��S�� ��1��9��E��Ź��4��(����>��&��2��4�� ��E��B߻WֈD��1��H��%��,�9��:�� ��>��I\Z��$֗>��A��S�� ��1��9��E��Ź��4��(����>��&��2��4�� ��E��B߻WֈD��1��H��%��,��:�� ��>��I��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��A?��$����&���� ۈX��2��@��Q��T��W��N��EܾW��,��;��P��T����,��T86������ ۈX��2��@��Q��T��W��N��EܾW��,��;ٱP����,��T��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��53��ޥ0��C��E��$����&��0��>�� ��2��EֈD��J��<��=�@,*��ޥ0��C��E����0��>�� ��2��EֈD�J��=�@���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��;��D��Hؕ7��;��E��E��@��;��Dؕ7��;��E��E��@���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��Xŷ5��D��/��D��/��Xŷ5��D��D���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D ��;��D��H��B��U��>��U��W��6��T��;��DΑB��>��U��6���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��;��D��H��D��H��D��H��T��;��D��D��D��T���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��;��D��H��>	��;��D��>���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��B��D��/��>	��B��D��>���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��;��D��H��=��<��T��;��D��=��T���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��;ӈ5��U��D��>��D��H��D��H��;�5��D��>��D��D���E��1��?��0��;��E��1��?��0��;ַ;��E��1��?��,��;��@Ԛ<ַ;��E��1��?��,��;��@Ԛ<��E��1��?��0��;��E��1��?��0��;;9��K��6������>��H��E��1��K��/��Q��4��D�G��K��I����A��B86��K��6������>��H��E��1��K��/��Q��4��G��K��I����A��B���E��1��?��0��;��E��1��?��0��;GE��D��K��O��F��H��E��1��K��/��Q��4��D�G��K��O��JܤK��>��6��D��G��@��K20���H��E��1��K��/��Q��4��G��ܤK��>��6��D��@��K��E��1��?��0��;��E��1��?��0��;#!��D��E��1ߢ?��0��8��I��D��<��B��B ��D��E��1�?��8��I��D��<��B��B���E��1��?��0��;��E��1��?��0��;��1��A�?��Iַ;	��1��A��I��E��1��?��0��;��E��1��?��0��;��E��1ߢ?��0	��E��1�?���E��1��?��0��;��E��1��?��0��;ַ;��E��1��?��,��;��@Ԛ<ַ;��E��1��?��,��;��@Ԛ<��E��1��?��0��;��E��1��?��0��;PNڤ5��5ַ;��>��E��1��?��1��B��T��/��>׆B��/��1��/��6��9��I�P��T��R��;��I��@Ԛ<MKܤ5ַ;��>��E��1��?��1��B��T��/��>׆B��/��1��/��6��9��I�P��T��R��;��I��@Ԛ<���E��1��?��0��;��E��1��?��0��;GE��D��K��O��F��H��E��1��K��/��Q��4��D�G��K��O��JܤK��>��6��D��G��@��K20���H��E��1��K��/��Q��4��G��ܤK��>��6��D��@��K��E��1��?��0��;��E��1��?��0��;A?��A׆B��?��K��U��E��I��3�R��>��7��D��E��1��?��P��;��6��6��@Ԛ<;9��A׆B��?��K��U��E��I��3��>��7��D��E��1��?��P��;��6��@Ԛ<Q��E��1��?��0��;��E��1��?��0��;��1��A�?��Iַ;	��1��A��I    ���E��1��?��0��;��E��1��?��0��;ַ;��E��1��?��,��;��@Ԛ<ַ;��E��1��?��,��;��@Ԛ<��E��1��?��0��;��E��1��?��0��;53��A��U��E��1��A��Iٟ@��;��N��?�9��8��5��D��@��@��@/-��A��U��E��1��A��@��;��N��?�9��8��5��D��@��@���G��=ݰF��B��S��F��G��=�F��F#!��B��N��0ݰF��B��S��F��2��Uа.��T��B��N�F��F��2��*��G��=ݰF��B��S��F��G��=�F��F)'��A�OݰF��B��F��A����S��F��>��L��S��2 ��A�O�F��F��A����F��>��L��S���G��=ݰF��B��S��F��G��=�F��F#!��B��N��0ݰF��B��S��F��2��Uа.��T��B��N�F��F��2��*��G��=ݰF��B��S��F��G��=�F��F86ݰF��B��S��F��Q��B��J��7��6��8��T��7��QݰF��B��S��F��B&$�F��F��Q��BќJ��6��8��7��Q�F��F��B���G��=ݰF��B��S��F��G��=�F��F#!��B��N��0ݰF��B��S��F��2��Uа.��T��B��N�F��F��2��*��G��=ݰF��B��S��F��G��=�F��F ��AסE��T��/ݰF��Bٟ@��3��@Ԛ<��A��T�Fٟ@��3��@Ԛ<���G��=ݰF��B��S��F��G��=�F��F#!��B��N��0ݰF��B��S��F��2��Uа.��T��B��N�F��F��2��*��G��=ݰF��B��S��F��G��=�F��F��S��F��U��R��7��T��F��U��7��T���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J��1��1�F֎T���� ��V��>б��1��1�F֎T���� ��Vб���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J/-��S��A��S��1��1�F֎T��=��>щQ��C��E��@��@��@,*��S��A��S��1��1�F֎T��=��>щQ��C��E��@��@���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J)'������1��1�F֎T��=��?��N��;��7��8��K������1��1�F֎T����7��K���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J����=��?��N��;��C�;����M������C�;����M���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�JJH��$��U��-£-��E��7��-Ҳ0��AʡH��9�D��S��&��1��1�F֎T��7��J��6������!A?��$��U��-£-��E��7��-Ҳ0��AʡH��9�D��S��&��1��1�F֎T��7��6�����1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J;9����1��1�F֎T��B��J��Hį-��H��U��Hڶ>��2��>��A��R��@Ԛ<;9����1��1�F֎T��B��J��Hį-��H��U��Hڶ>��2��>��A��R��@Ԛ<���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J ӪN��1��1�F֎T����E��@��@��@ӪN��1��1�F֎T����E��@��@���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J86��1��1�F֎T��0��3��V��C��J��7��6��8��T��7��1��1�F֎T20��1��1�F֎T��0��3��V��CќJ��6��8��7��1��1�F֎T���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J��1��1�F֎T��@��?��1��1�F֎T��@���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�JSQ��D��R�0��7��>��I�8Ҳ0��2��A��Xڃ��N��>��1��1�F֎T��A��K��Aٟ@�H��D��P�D��APN��D��R�0��7��>��8Ҳ0��2��A��Xڃ��N��>��1��1�F֎T��A��K��Aٟ@�H��D��P�D��A���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J53��1��1�F֎T��W��"�����)����$������I��K��4��6)'��1��1�F֎T��W��"�����)������I��4���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�JJH����7��&��:֎T��1��1�F֎T����T��T��T����=��?��N��;��T����������!53����7��&��:֎T��1��1�F֎T����T��T��T������T��K���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J&$��C��E��>��1��1�F֎T��@��0��=��@Ԛ<&$��C��E��>��1��1�F֎T��@��0��=��@Ԛ<���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�JPN��1��1�F֎T����=��?��N��;����7��=��?��N��;��G����T��T��T������
-�� ����!.,��1��1�F֎T��������7����G����T��T��T�+���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�Jki��1��1�F֎T��K��Sħ;��S�� ��C��9��>��>��4��K��.��T��RҲ0��A��G�B�@��>�����=��?��N��;�����)��ʪ\Z��1��1�F֎T��K��S��S�� ��Cޖ>��>��4��K��.��T��RҲ0��A��G�B�@��>����������)��ʪ���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J53����&��1��1�F֎T��R��B��O��E��V����C��E��@��@��@,*��&��1��1�F֎T��R��B��O��E����C��E��@��@���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J,*����7��D��T��1��1�F֎T��6��U��=��?��N��;����7��T��1��1�F֎T��6�����1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J_]��$��U��-£-��E��7��-Ҳ0��AʡH��9�D��S��&��1��1�F֎T��$����U��-��CɤU��TҲ0�A��B������!YW��$��U��-£-��E��7��-Ҳ0��AʡH��9�D��S��&��1��1�F֎T��$����U��-��CɤU��TҲ0�A��B���86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CVT��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��EģC��K��GģC��:��7����B��W��T53��X��1ʡH��9��7����N��W��/��D�C��GģC��7����B��W86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CJH��X�,��1�9��CʡH��9��7�/��
-������N��W��=��H��D��E�8��K��D��G��@��K/-��X��1ʡH��9��7΂��N��W��/��D�8��K��D��@��K�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��C��H��D��EģC��K΂:��6��T)'��X��1ʡH��9��7����N��W޻/��D�C΂:��686��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C\Z��-��A��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��EģC��K�
-��:��K��4��8��?��:��T><��-��A��X��1ʡH��9��7����N��W��/��D�C�
-��:��4��8��?��:��T�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C;9��X�,��1�9��C��K�/�
-������N��W��=��H��D��E��G����I#!��X��1��K����N��W��/��D��G����I86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�Cb`��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E��K����?����I��U��>��D��E��?��T΂:��C̛<A?��X��1ʡH��9��7����N��W��/��D��E����?����I��U��D��?΂:��C̛<�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E�8��K΂:��4��T/-��X��1ʡH��9��7����N��W��/��D�8��K΂:��4��T�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CVT��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��EģC��K��GģC��:��7����B��W��T53��X��1ʡH��9��7����N��W��/��D�C��GģC��7����B��W86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��E��>��K��X��1��K����N��W��/��D��>�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��C��H��D��EģC��K΂:��6��T)'��X��1ʡH��9��7����N��W޻/��D�C΂:��686��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CSQ��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E��O��K��D��O��G��D��O��6��G20��X��1ʡH��9��7����N��W��/��D��O��D��G��D��6��G�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C;9��X�,��1�9��C��K�/�
-������N��W��=��H��D��E��G����I#!��X��1��K����N��W��/��D��G����I86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CJH��X�,��1�9��CʡH��9��7�/�
-������/��@��C��H��W��D��EģC��KùB��N��L,*��X��1ʡH��9��7����N޻/��W��D�CùB��N��L�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�Cqo��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E��>��K����L��2��8��A��W��T��6��O�0��UP��D��7��>��6��;PN��X��1ʡH��9��7����N��W��/��D��>����L��P��A��W��6��O�0��UP��D��7��>��6��;�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CVT��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��EģC��K��GģC��:��7����B��W��T53��X��1ʡH��9��7����N��W��/��D�C��GģC��7����B��W86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C><��D��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E��K&$��D��X��1ʡH��9��7����N��W��/��D��E�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��C��H��D��EģC��K΂:��6��T)'��X��1ʡH��9��7����N��W޻/��D�C΂:��686��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CPN��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E�/��K��I���/�/��C��T20��X��1ʡH��9��7����N��W��/��D��/��I���/��C��T�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C;9��X�,��1�9��C��K�/�
-������N��W��=��H��D��E��G����I#!��X��1��K����N��W��/��D��G����I86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��E��P��K ��X��1��K����N��W��/��D��P��K�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E�8��K�/��6��T,*��X��1ʡH��9��7����N��W��/��D�8��K�/��6�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CVT��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��EģC��K��GģC��:��7����B��W��T53��X��1ʡH��9��7����N��W��/��D�C��GģC��7����B��W86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CMK��X�,��1�9��CʡH��9��7�/�
-������N��W��C��H��D��EģC��K��:΂:��<��B��B20��X��1ʡH��9��7����N��W޻/��D�C��:΂:��<��B��B�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��C��H��D��EģC��K΂:��6��T)'��X��1ʡH��9��7����N��W޻/��D�C΂:��686��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CVT��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E��P��K��G��D��O��6΂:��P��6��G;9��X��1ʡH��9��7����N��W��/��D��P��K��G��D��6΂:��P��6��G���N��B��-��<��6��6��N��-��<��6��686��-��I�I��6�I��6�I��6��6�O��U��>��E��6��D��S�D��A53��-��I�I��6�I��6�I��6��6��U��>��E��6��D��S�D��A��N��B��-��<��6��6��N��-��<��6��6#!��N��B��-��<��6��6�O��D��S�D��A��N��-��<��6��6��D��S�D��A���N��B��-��<��6��6��N��-��<��6��6_]��-��I��6��D��D��9��D��6��6��>��=��/��,ֈ;��N��?�K�C��L��3��;ނB��/��6��/��7��T��Nؕ7؄/��ESQ��-��I��6��D��9��6��6��>��=��/��,ֈ;��N��?�K�C��L�3ނB��/��6��/��7��T��Nڕ7��E��N��B��-��<��6��6��N��-��<��6��6><��-��I��6��6�O��E��6��0��FǂS��H��A��V��T��J��D��8��D��A��P/-��-��I��6��6��E��6��FǂS��H��A��V��T��D��8��A���N��B��-��<��6��6��N��-��<��6��686��-��I�I��6�I��6�I��6��6�O��U��>��E��6��D��S�D��A53��-��I�I��6�I��6�I��6��6��U��>��E��6��D��S�D��A��N��B��-��<��6��6��N��-��<��6��6#!��N��B��-��<��6��O��C��8��A��9��9��N��-��<��6��O��C��8��A��9���N��B��-��<��6��6��N��-��<��6��6_]��-��I��6��D��D��9��D��6��6��>��=��/��,ֈ;��N��?�K�C��L��3��;ނB��/��6��/��7��T��Nؕ7؄/��ESQ��-��I��6��D��9��6��6��>��=��/��,ֈ;��N��?�K�C��L�3ނB��/��6��/��7��T��Nڕ7��E��N��B��-��<��6��6��N��-��<��6��6��<��C��6��=�R��J��<��C��6�R��J���N��B��-��<��6��6��N��-��<��6��686��-��I�I��6�I��6�I��6��6�O��U��>��E��6��D��S�D��A53��-��I�I��6�I��6�I��6��6��U��>��E��6��D��S�D��A��N��B��-��<��6��6��N��-��<��6��6)'��D��-��I��H��D��6��/��E��6��-��1��1��6)'��D��-��I��H��D��6��/��E��6��-��1��1��6���N��B��-��<��6��6��N��-��<��6��6_]��-��I��6��D��D��9��D��6��6��>��=��/��,ֈ;��N��?�K�C��L��3��;ނB��/��6��/��7��T��Nؕ7؄/��ESQ��-��I��6��D��9��6��6��>��=��/��,ֈ;��N��?�K�C��L�3ނB��/��6��/��7��T��Nڕ7��E��N��B��-��<��6��6��N��-��<��6��6 кB��-��<ԋ/��C��6��6��JƱC��TкB��-��<ԋ/��C��6��6��JϱC���N��B��-��<��6��6��N��-��<��6��686��-��I�I��6�I��6�I��6��6�O��U��>��E��6��D��S�D��A53��-��I�I��6�I��6�I��6��6��U��>��E��6��D��S�D��A��N��B��-��<��6��6��N��-��<��6��6,*��N��B��-��<�I��6��C��?����6��P����6��T&$��N��-��<�I��6��C��?����6��P����6���N��B��-��<��6��6��N��-��<��6��6_]��-��I��6��D��D��9��D��6��6��>��=��/��,ֈ;��N��?�K�C��L��3��;ނB��/��6��/��7��T��Nؕ7؄/��ESQ��-��I��6��D��9��6��6��>��=��/��,ֈ;��N��?�K�C��L�3ނB��/��6��/��7��T��Nڕ7��E��N��B��-��<��6��6��N��-��<��6��686��5��-��<��H��C��?��D��A��P��;��0��T��?��6��T��)����!&$��5��-��<��H��C��?��A��;��T��6��T��)���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/53��4��U��İU��7�/��5�.��W��@ßN��W��F�/��U��I��T20��4��U��İU��7�/��5�.��W��@ßN��W�/��U��I��T���İU��7�/��İU��7�/86��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��W��A��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��A��İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐWН?,*��4��U��İU��7�/��5�.��W��@ßN��W�/̐W���İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐW��W/-��4��U��İU��7�/��5�.��W��@ßN��W�/ÐW��W��İU��7�/��İU��7�/;9��U��İU��7�/�.��W��@ßN����ۏ"����(����!��U��E��T(&��U��İU��7�/�.��W��@ßN�[��U��E��T���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��9��TŇ7��I��T/-��4��U��İU��7�/��5�.��W��@ßN��9��TŇ7��I���İU��7�/��İU��7�/86��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��W��A��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��A��İU��7�/��İU��7�/;9��4��U��İU��7�/��5�.��W��@ßN��0��W��F�9��G��3��Q��T86��4��U��İU��7�/��5�.��W��@ßN��0��W�9��G��3��Q��T���İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐW��W/-��4��U��İU��7�/��5�.��W��@ßN��W�/ÐW��W��İU��7�/��İU��7�/,*��4��U��İU��7�/��5�.��W��@ßN��W��F�/)'��4��U��İU��7�/��5�.��W��@ßN��W�/���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/,*��4��İU��7�/��5��:��S��9İUН?̛<��7��T&$��4��İU��7�/��5��:��S��9İU��?��7���İU��7�/��İU��7�/86��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��W��A��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��A��İU��7�/��İU��7�/��R��>Н?��T	��R��>ܞ?���İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐW��W/-��4��U��İU��7�/��5�.��W��@ßN��W�/ÐW��W��İU��7�/��İU��7�//-����U��İU��7�/���.��W��@ßN�1��T��7̛<,*����U��İU��7�/���.��W��@ßN��1��7̛<���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/53��4��İU��7�/��5��:��S��9İU��:��4��K����"����!,*��4��İU��7�/��5��:��S��9İU��:��4��K��"���İU��7�/��İU��7�/86��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��W��A��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��A��İU��7�/��İU��7�/;9��4��U��İU��7�/��5�.��W��@ßN��W��F�?�9��G��HН?��T20��4��U��İU��7�/��5�.��W��@ßN��W�?�9��G��/���İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐW��W/-��4��U��İU��7�/��5�.��W��@ßN��W�/ÐW��W��İU��7�/��İU��7�/GE����U��İU��7�/�.����W��@ßN��W��F�/ɴ9Н?��Tɴ9ʡH��9��?�/��T;9����U��İU��7�/�.����W��@ßN��W�/ɴ9ܞ?ɴ9��9�/��T���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/#!��4��U��İU��7�/��5�.��W��@ßN#!��4��U��İU��7�/��5�.��W��@ßN���İU��7�/��İU��7�/86��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��W��A��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��A��İU��7�/��İU��7�/86��4��U��İU��7�/��5�.��W��@ßN��W��F�/̝5̛<��Q��T20��4��U��İU��7�/��5�.��W��@ßN��W�/̝5̛<��Q���İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐW��W/-��4��U��İU��7�/��5�.��W��@ßN��W�/ÐW��W��İU��7�/��İU��7�/53��T��1��4��U��İU��7�/��5�.��:��S��9İU��W��Q��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��W��Q���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/A?��4��U��İU��7�/��5�.��W��@ßN��W��F�/��E��:��Tɴ9��:��T�;86��4��U��İU��7�/��5�.��W��@ßN��W�/��E��:ɴ9��:�;�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J20��K��F��Eڶ>��FˎW��B��D��I��K��T�����)��ʪ/-��K��F��Eڶ>��FˎW��B��D��I��K�����)��ʪ�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J&$��Sį-��K��>��J���N��T���C��T��T#!��Sį-��K��>��J�ϞN���C��T��T�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J кB��6��Sį-��K��I��K��T��:��KкB��6��Sį-��K��I��K��:��K�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J#!��;��Kʗ,��/��Sտ7��P��C��@��;��B ��;��Kʗ,��/��Sտ7��P��C��;��B�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J20��3��B��B��D��K��6��S��9��A��@��S��@��0��6��0��T20��3��B��B��D��K��6��S��9��A��@��S��@��0��6��0��T�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J/-��;��Kʗ,��/P��L��>��C��B��F�R��K�A��K��B,*��;��Kʗ,��/P��L��>��C��B��F�R��K�A��K�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J)'��;��Kʗ,��/��S��K��D͙7��I��R��N͙7��T&$��;��Kʗ,��/��S��K��D��I��R��N͙7��T�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J#!��S��Kб��J��7��6��8��T��7��U��>��S��KбќJ��6��8��7��U��>���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;��,��B��R��/��>��4��7��,��B��R��/��>��4��7���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;;9ѹ6��7��,��BƸ=��D��J��7��.��K��/��B��9��A��=��B��@��@��@&$չ6��,��BƸ=��D��J��*��/��B��A��@��@���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;;9��R��/��B��,��B��.��P��԰��'��0��V��A������Uѹ6��F��G,*��R��/��B��,��B��.��P��0��V��A��"��Uݹ6��G���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;zx��R��/��,��B��C��M��R��/��@��B��B�R��-��P��2�K��O��N��J��7��6��8��T��7��;��2��/ޟEŮ<��N��6��9��A��=��Bǭ;��HΆO��-��5_]��R��/��,��B��C��M��R��/��@��B��B�R��-��P��2�K��O��NќJ��6��8��7��;��R��6��N��A�HΆO��-��5���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;86���
-��>ß1��QİL��	��R��/Ξ)��,��B��WβI��3��I��@��K/-��>ß1��QİL��	��R��/Ξ)��,��B��WβI��3��I��@���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;\Z��(����<��7����N��6����B����=��G�;��3��>��7����K���������������������#!��<����K����������������R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;)'��N��6��@��4��,��B��H��A��R��/��D��@Ԛ<&$��N��@��4��,��B��H��A��R��/��D��@Ԛ<���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;/-��R��/��,��B��@��Hٟ@ʜ2��I��A��N��6��@��@��@)'��R��/��,��B��@��Hٟ@ʜ2��I��A��N��@��@���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;,*��/��>��,��B��J��>��,��B��J��>��,��B��A��B,*��/��>��,��B��J��>��,��B��J��>��,��B��A��B���B��7��8��;��U��B��8��;��U��B��;��U��>��C��@��K��B��;��U��>��C��@��K��B��7��8��;��U��B��8��;��U,*ʡH��9��=��7��B��;��U��>��C��E��J��<��B��B,*ʡH��9��=��7��B��;��U��>��C��E��J��<��B��B���B��7��8��;��U��B��8��;��U ��B��;��U��>��C��7��C��<��B��B��B��;��U��>��C��7��<��B��B��B��7��8��;��U��B��8��;��U53��B��7��;��U��S�L��8��7��B��9��7��7��	��N����H��&$��B��;��U��S�L��8����	��N����H�����B��7��8��;��U��B��8��;��U&$��B��7��;��>��8��N��@���>��;��G��B ��B��;��>��8��N��@���>��;��G��B��7��8��;��U��B��8��;��U,*��B��7��;��>��8��N��@Ɓ-��6��7Ɓ-��6��H��T#!��B��;��>��8��N��@ȁ-��7ȁ-��H��T���B��7��8��;��U��B��8��;��U��7��B��;��U��>��C��8�,��T��7��B��;��U��>��C��,��T��B��7��8��;��U��B��8��;��U��B��;��U�L��C��8�,��T��B��;��U�L��C��,��T���B��7��8��;��U��B��8��;��U��B��;��U��>��C��@��K��B��;��U��>��C��@��K��B��7��8��;��U��B��8��;��U)'��7��B��;��U��>��C��B��U��8��J��<��B��B)'��7��B��;��U��>��C��B��U��8��J��<��B��B���B��7��8��;��U��B��8��;��U ��B��;��U��>��C��7��C��<��B��B��B��;��U��>��C��7��<��B��B��B��7��8��;��U��B��8��;��U,*��B��7��>��;��U��N��8��C��.��V��I��<��7��; ��B��>��;��U��N��8��C��.��I��7���F��U��/��J��.ʭB�/��F��J��.ϭBMK��D��V��D��:��J��TʭB�/��>ڶ>��9Ԛ<��Q��D��M��/��F��U��7��>�G��J��E��@Ԛ<A?��D��V��D��:��J��TϭB��>��9Ԛ<��Q��D��M��F��7��>�G��J��E��@Ԛ<��F��U��/��J��.ʭB�/��F��J��.ϭB��J��.ʭB�/��@��?��J��.ϭB��@���F��U��/��J��.ʭB�/��F��J��.ϭB#!��J��.ʭB�/��L����F��U��O��@��K��J��.ϭB��L����F��O��@��K��F��U��/��J��.ʭB�/��F��J��.ϭB)'��J��.ʭB�/��L����F��;��F��?��8�,��T ��J��.ϭB��L����F��;��F��,��T���F��U��/��J��.ʭB�/��F��J��.ϭBMK��D��V��D��:��J��TʭB�/��>ڶ>��9Ԛ<��Q��D��M��/��F��U��7��>�G��J��E��@Ԛ<A?��D��V��D��:��J��TϭB��>��9Ԛ<��Q��D��M��F��7��>�G��J��E��@Ԛ<��F��U��/��J��.ʭB�/��F��J��.ϭB,*��J��.ʭB�/��L����F��U��O��'��G��N��O��C&$��J��.ϭB��L����F��O��'��G��N��O��C���F��U��/��J��.ʭB�/��F��J��.ϭB#!��J��.ʭB�/��L����F��U��O��@��K��J��.ϭB��L����F��O��@��K��F��U��/��J��.ʭB�/��F��J��.ϭB,*��J��.ʭB�/��L����F��U��O��L��B��<��B��B&$��J��.ϭB��L����F��O��L��B��<��B��B���F��U��/��J��.ʭB�/��F��J��.ϭBMK��D��V��D��:��J��TʭB�/��>ڶ>��9Ԛ<��Q��D��M��/��F��U��7��>�G��J��E��@Ԛ<A?��D��V��D��:��J��TϭB��>��9Ԛ<��Q��D��M��F��7��>�G��J��E��@Ԛ<��F��U��/��J��.ʭB�/��F��J��.ϭB��J��.ʭB�/��>��L��J��.ϭB��>��L���F��U��/��J��.ʭB�/��F��J��.ϭB#!��J��.ʭB�/��L����F��U��O��@��K��J��.ϭB��L����F��O��@��K��F��U��/��J��.ʭB�/��F��J��.ϭB53��J��.ʭB�/��8��N��J��.ʭB�/�G��>��9��8��F��>��T,*��J��.ϭB��8��N��J��.ϭB�G��>��9��8��F��>���F��U��/��J��.ʭB�/��F��J��.ϭBMK��D��V��D��:��J��TʭB�/��>ڶ>��9Ԛ<��Q��D��M��/��F��U��7��>�G��J��E��@Ԛ<A?��D��V��D��:��J��TϭB��>��9Ԛ<��Q��D��M��F��7��>�G��J��E��@Ԛ<��F��U��/��J��.ʭB�/��F��J��.ϭB/-��J��.ʭB�/��8��I��C¨0��3��?��;��9��<��>��T��J��.ϭB��8��IϨ0��-��<��>���F��U��/��J��.ʭB�/��F��J��.ϭB#!��J��.ʭB�/��L����F��U��O��@��K��J��.ϭB��L����F��O��@��K��F��U��/��J��.ʭB�/��F��J��.ϭB��J��.ʭB�/�;��J��6��J��.ϭB�;��J��6���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=/-��?��;��8��W��B��=��&��;��WɾS��2��S��C��I��9)'��?��;��8��W��B��=��&��;��W��2��S��C��9���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=/-��H޽B��;��8��A��E��0��W��B��=щQ��U��P��.��T,*��H޽B��;��8��A��E��0��W��B��=щQ��U��P��.���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=JH��W��B��R��P��I��9��=��5��0��<��G��0��G��8��7��W��G��Q��D��2��G��OŒA��TDB��W��B��R��P��I��=��5��0��<��G��0��G��8��7��W��G��Q��D��2��GŒA��T���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=20��P��G��,��D��N��G��8��0��6��W��B��=��C��=��S��7,*��P��G��,��D��N��G��8��5��W��B��=��C��S��7���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=GE��W��=��D��,��?��R��;��G��0��G��8��D��N��@��W��G��7ӽD��I��E��CӽD��I><��W��=��D��7��R��;��G��0��G��8��D��N��@��W��GӽD��I��EӽD��I���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=20޽B��R��0��W��B��>��=�M��>��I��?��;��8щQ��@Ԛ<20޽B��R��0��W��B��>��=�M��>��I��?��;��8щQ��@Ԛ<���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=)'��;��8��0��W��B��=��D��>щQ��D��S�D��A&$��;��8��0��W��B��=ӗ>щQ��D��S�D��A���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=JHԓ4��5��9��D��0ԓ4��B��=����S��R��J�>��E��;��8��6��S��T�!����!����";9ԓ4��5��D��0ԓ4��B��=����S��R��J�>��E��;��8��6��S��T��X���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=#!��;��8��>��E��6��Q��W��B��=��@��N ��;��>��E��6��Q��W��B��=��@��N���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=��R����8��G��8��>��=��>��P��R����8��G��8��>��=��>��P���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=VTԓ4��5��9��D��0ԓ4��B��=��O��<��G��BǄP�B�@��;��8��>׽R��G��6��S��T�!����!����"DBԓ4��5��D��0ԓ4��B��=��O��<��G��BǄP��B��;��8��>׽R��G��6��S��T��X���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=DB��G׫;��@��2��>��H��8��G�K��0��G��8��W��B��=��F��?��H��G��,��H��,DB��G׫;��@��2��>��H��8��G�K��0��G��8��W��B��=��F��?��H��G��,��H��,���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=\Z��7��W��C����Ȼ����������2��2��H��G��/��C��N�K��0��8��W��=ߌ,��3��=��G��Gև9��>��TYW��7��W��C����Ȼ����������2��2��H��G��/��C��N�K��0��8��W��=ߌ,��3��=��G��Gև9��>���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=����H��$��,��G��G��8��8��W��-��B��G��H��H��H��$��,��G��G��8��8��W��-��B��G��H��H��Q��H��$��,��G��G��8��8��W��-��B��G��H��H��D��E��T��L��B��L��=��,��K����H��$��,��G��G��8��8��W��-��B��G��H��H��$��,��G��G��8��8��W��-��B��G��H��Q��H��$��,��G��G��8��8��W��-��B��G��H��D��E��T��L��B��L��,��K���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=86��G��,��D��N��G��8��0��6��W��B��=��C��=��Pֈ;̛<��A��T/-��G��,��D��N��G��8��5��W��B��=��C��Pֈ;̛<��A���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=DB��T��C��R�J��G��<��8��Q��G��8��O��6��0��G��6��U��<��8��Gڶ>��S��=86��C�J��G��<��8��Q��G��8��O��6��0��G��6��<��8��G��S��=���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=DB��S��9��I��/��C��D��<��8�J��Gԓ4��G��W��B��-��R��N��=��
-��K��F�7DB��S��9��I��/��C��D��<��8�J��Gԓ4��G��W��B��-��R��N��=��
-��K��F�7���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��20��������Ͳ��4ʉ5��/��%��D�H��G��A��A��O��C��4ˉ5��%��D�H��A��A��O��C���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5�� ��������ʉ5ޚT��D��G��@��K����5��D��@��K���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��&$��������ۚKʉ5��R��G̛<��"����&��ۚK݉5��G̛<��"���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��ʉ5����������8�,��Tʉ5����,��T���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��20��7������������ʉ5ޚT��4��L��/ȈX��<��B��B��7��5��4��L��/ȈX��<��B��B���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��,*����������H���6��=��>ʉ5��B��-��A��B#!����H���6��=��>ʉ5��B��-��A��B���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��;9��������ۚK��4ʉ5��G��8��O��E��>έ;��L�S��DʡH��9�;,*��ۚK��4ʉ5��G��O��E��>٭;�S��DʡH��9�;���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��#!��@����������>ʉ5��D��S�D��A��@����>ʉ5��D��S�D��A���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��&$������$��6����������6ʉ5��@Ԛ<������$��6����6ʉ5��@Ԛ<�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6��@��K��-��;��@��K��-��;�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��653��.��H�B��@��M��6��4��A��6�O��I��0щQ��U��P��.��T/-��.��H�B��@��M��6��4��A��6��I��0щQ��U��P��.�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6><��.��H�B��@��M��6��4��A��6�O��H��A��V��T��J��D��8��D��A��P20��.��H�B��@��M��6��4��A��6��H��A��V��T��D��8��A�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6)'��.��4�9�B�3��I��6�O��F��U��P��U��T#!��.��4�9�B�3��I��6��F��U��P��U�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6A?ڤ5��5��D��>��.��1�B��@��D��4��A��=��������@��6�O��G��;��P20ܤ5��D��>��.��1�B��@��4��A��=����@��6��G��;��P�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6GE��.��J��S��=��H�B��@��D��H��4��A��D��A��P��;��0��T��?��6��T��)����!,*��.��S��H�B��@��H��4��A��A��;��T��6��T��)�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��620��.�B��K��M��4��A��H��A��V��T��J��D��8��D��A��P)'��.�B��K��M��4��A��H��A��V��T��D��8��A�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��.��H�B��@��D��4��A��=��6�O��G��U��P��9��T#!��.��H�B��@��4��A��S��G��U��P��9���@��G��M��T	��@��G��M><��VܞN��T��>�� ���� ��B�IɤU��1��.��@��G��M��T��C�3��G��9/-��V��N��>��B�IɤU��1��.��@��G��M��C�3��G��9��@��G��M��T	��@��G��MDB��@��G��.��M��T��A�/��B��@��G��.��M��T��Q��8ޚT��N��G��K��T��O��T,*��@��G��6��A�/��@��G��6��Q��8��+��K��T��O���@��G��M��T	��@��G��M;9��@��G��.��M��T��,��;��M��T��7��3��;��E��=��5��7��T��Iַ;)'��@��G��6��,��;��M��7��;��E��5��7��T��I��@��G��M��T	��@��G��M)'��@��G��@��M��T�/��-��5��6��P��9�?ַ;#!��@��G��@��M�/��-��5��6��P��9��?���@��G��M��T	��@��G��M����@��G��.��M��T��G��@��=��@��G��M��T��.��@��M��T��C�3��G�3��G��9ܞN��T��T��O��C�3��G��9��8��Iַ;��@��G��.��M��T��G��@��=��@��G��M��T��.��@��M��Tki��@��G��6��G��=��@��G��M��.��@��M��C�3��G�3��G��9��N��T��C�3��G��9��I��@��G��6��G��=��@��G��M��.��@��M��@��G��M��T	��@��G��M86��@��G��.�4��@ϚL��4��M��T�;��M�4߹-��W��Hԓ6��Iַ;&$��@��G��.��@��4��M��6߹-��W��Hԓ6��I���@��G��M��T	��@��G��M&$��@��G��.��M��T߹-�5��T��O��O��Iַ;��@��G��6߹-�5��T��O��I��@��G��M��T	��@��G��M����.��@��M��T��.��M��T��@��M��TܞN��D��>��.��M��T��E��=��.��M��T��=��.��M��T�IϪJ��1��.��M��@��G��.��@��M��T��D��C�3��G��9��8��Iַ;\Z��.��@��M��6��@��M�N��>��6��E��=��6��=��6�IϪJ��1��.��M��@��G��.��@��M��D��C�3��G��9��I���@��G��M��T	��@��G��M&$��.��M��@��G��M��T�J��-��U��@ؙD��T#!��.��M��@��G��M�J��-��U��@ؙD��T��@��G��M��T	��@��G��M)'��'��=��.��@��G��M��T��I��B��.��<��B��B#!��'��=��.��@��G��M��I��B��<��B��B���@��G��M��T	��@��G��M\Z��.��M��@��G��M��T��.��M��T��.��@��M��T��@��M��T��E��M��T��=��.��M��T��C�3��G��9��8��Iַ;><��.��M��@��G��M��6��.��@��M��@��M��E��M��=��6��C�3��G��9��I��@��G��M��T	��@��G��M/-��@��G��=��@��G��.��M��T��=��.��M��T�I��G��@ ��@��G��=��@��G��6��=��6�I��G���@��G��M��T	��@��G��M><��VܞN��T��>�� ���� ��B�IɤU��1��.��@��G��M��T��C�3��G��9/-��V��N��>��B�IɤU��1��.��@��G��M��C�3��G��9��@��G��M��T	��@��G��MDB��M��U��@��G��@��M��T��@��M��T��M��T��M��U��,��H��P��5ѳBʈF��P��?53��M��@��G��@��M��@��M��M��M��,��H��P��5ѳBʈF��P��?���@��G��M��T	��@��G��M;9��@��G��.��M��T��,��;��M��T��7��3��;��E��=��5��7��T��Iַ;)'��@��G��6��,��;��M��7��;��E��5��7��T��I��@��G��M��T	��@��G��M53��@��G��.��M��Tַ;��@��G��.��M��T��D��,��B��Pַ;Υ6&$��@��G��6ַ;��@��G��6��D��,��Pַ;Υ6c    ��I��6�;��0ڳQ	��+��0ڳQ ��I��6�;ٟ@�9ٟ@��0��A��@Ԛ<��+��9��0��A��@Ԛ<K    ��I��6�;��0ڳQ	��+��0ڳQ��I��6�;��-��N	��+��-��N�    ��I��6�;��0ڳQ	��+��0ڳQ20��I��6�;��0ʭBќ:��-��W��I��6�;��I��6��>��S��2&$��+��0ʭBќ:��-��W��I��6��I��6��>��So    ��I��6�;��0ڳQ	��+��0ڳQ&$��U��I��I��6�;��-��N�1��D��@��@��@��U��I��+��-��N�1ځD��@�    ��I��6�;��0ڳQ	��+��0ڳQSQ��������I��6�;��0��9��6�W��I��-��:��P��U��PޜF��T��IP��R��M��T��I��6ޜF��6JH��������+��0��9��6�W��I��-��:��P��U��PޜF��T��I��R��M��T��I��6ޜF��6�    ��I��6�;��0ڳQ	��+��0ڳQA?��Q��2�?��E��C��=��E��@��.��=��9�Q��C��B��9�Q��C��ͦ(����!)'��Q��2�?��E��C��=��@ƋQ��C��BƋQ��C��i    ��I��6�;��0ڳQ	��+��0ڳQ ��.��I��W��I��6�;��8�T��A��B��.��I��W��+��8�T��A��B�    ��I��6�;��0ڳQ	��+��0ڳQ86��I��6�;��6��U��=�9��=��>��C�<ʡH����6��I��H�<��T&$��+��6��9��>��C�<ʡH����6��I��H��T���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;GE��<��M��N��L��;��;��T��B��T��4��B��T��/��R��6��G��U��K��P��9��PگD��T><��<��M��N��L��;��;��B��4��B��T��/��R��6��G��U��K��9��PگD��T���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;53ä=��F��9��E��N��L��<��M��N��L��M��T��M��=��E��P��>,*ä=��F��B��N��L��<��M��N��L��M��T��M��E��P���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;����.��M�@��D��>��3��PϪJ��B��E҄J�J��9��R��>�9ֈD��C��S��W��9ٟ@��1��9��2��D��>��9��E��<��M��N��L��A��M��7��S�9��=�>��D��>��9��@��S��6��;��,��D��P��>��=��/��U��P��.��T����.��M�@��D��>��3��PϪJ��B��EԄJ��9��R��>��D��C��S��W�@��1��9��2��D��>��B��<��M��N��L��A��M��S��=�>��D��>��9��@��S��6��;��,��D��P��>��=��/��U��P��.���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;YW��J��9��E��<��M��N��L����5��?��J����7��7��E��B��=����H��Q��2��8����@ǆ9��V��T��P��HSQ��J��B��<��M��N��L����5��?��J����7��7��B��=����H��Q��2��8����@ǆ9��V��T��P��H���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;JH��D��9��>��6��E��>��<��M��N��LëO��8��2޽B��4��;щQʡHб��6��T��H�<��TGE��D��9��>��6��E��>��<��M��N��LëO��8��2޽B��4��;щQʡHб��6��T��H��T���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;#!��9��EʕV��<��M��N��LʕV��6��@Ԛ< ��BʕV��<��M��N��LʕV��6��@Ԛ<���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;PN��N��A��=��<��M��N��L��;��;��T�Rڶ>��S��E��>��C��=��,��B��/��7Ȼ;��T��=��.��LGE��N��A��=��<��M��N��L��;��;��T�R��S��E��>��C��,��B��/��7Ȼ;��T��=��L�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G,*��A��>����Q��5��=��Qڶ>��S��ȥW��@��@��@&$��A��>����Q��5��=��Q��S��ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G53��>��W��5��CȥW��G��8��E��<����=��?��N��;M�8��T)'��>��W��5��CȥW��G��8��E��<����M��8�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��GDB��>����@��G��W�C����;��9��Q��6��6��B��W����4����ȥW��@��@��@><��>����@��G��W�C����;��9��Q��6��B��W����4����ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��Ghf����;��>����>��WȥW��,��:��K��>��;����=��?��N��;����7��=��?��N��;��G����T��T��T������
-�� ����!FD����;��>����>��WȥW��,��:��K��>��;��������7����G����T��T��T�+�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G,*��A��>����Q��5��=��Qڶ>��S��ȥW��@��@��@&$��A��>����Q��5��=��Q��S��ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��GDB��>��W��5��C����WȥW��G��8��E��<��=��?��N��;����T��T��T��G�8̛<86��>��W��5��C����WȥW��G��8��E��<������T��T��T��G��8�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��GDB��>����@��G��W�C����;��9��Q��6��6��B��W����4����ȥW��@��@��@><��>����@��G��W�C����;��9��Q��6��B��W����4����ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G20��ȥW��>��W��2��G�/��I֣.ŞG��9�/��;����7�;20��ȥW��>��W��2��G�/��I֣.ŞG��9�/��;����7�;�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G,*��A��>����Q��5��=��Qڶ>��S��ȥW��@��@��@&$��A��>����Q��5��=��Q��S��ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G20����>����>��Q��R��@��8��S֗T��7��ȥW��@��@��@/-����>����>��Q��R��@��8��S֗T��7��ȥW��@��@�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��GDB��>����@��G��W�C����;��9��Q��6��6��B��W����4����ȥW��@��@��@><��>����@��G��W�C����;��9��Q��6��B��W����4����ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G20����>��ȥW��S��8��D��0��;����T����=��?��N��;)'����>��ȥW��S��8��D��0��;����T���������ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G,*��A��>����Q��5��=��Qڶ>��S��ȥW��@��@��@&$��A��>����Q��5��=��Q��S��ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G#!��ȥW��ȥW��K��ȥW��,��:ĝ�� ��ȥW��ȥW��K��ȥW��,��:؝�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��GDB��>����@��G��W�C����;��9��Q��6��6��B��W����4����ȥW��@��@��@><��>����@��G��W�C����;��9��Q��6��B��W����4����ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G_]��N��9��U��L��=��>˾3��ȥW��>����G��/��N��Iǡ6����T��T��T����=��?��N��;��T����������!DB��N��U��=��>˾3��ȥW��>����G��/��N��Iǡ6����T��T��T������T��K�	��?�J�=	��?�J�=)'��V��H��D�J��>��4�=��5��D�3Ȼ;��>��T ��V��D��>��4�=��5��D�3Ȼ;��>	��?�J�=	��?�J�=��D�J��>��?��=��D�J��>��?��=}	��?�J�=	��?�J�=��?�J�=��Uа.��T��?�J�=��*	��?�J�=	��?�J�=�J��?��,��=�J��?��,��=�	��?�J�=	��?�J�=;9��?��E�J�=׍Q��7��E��7��0��	��NʡH��
-��H��0��6��4��T��53��?��E�J�=׍Q��,��0��	��NʡH��
-��H��0��6��4��T��	��?�J�=	��?�J�=��D�J��>��?��=��GĊA��>��T��D�J��>��?��=��GĊA��>�	��?�J�=	��?�J�=/-��D��9��D��D�G��?��>�J��>��,��N��D��S�D��A#!��9��G��?��>�J��>�=��D��S�D��A	��?�J�=	��?�J�=��?��E�J׍Q��D��G��@��K��?��E�J׍Q��D��@��K�	��?�J�=	��?�J�=��D��/��F��;	��D��F��;	��?�J�=	��?�J�=,*��D�J��>��?��=��E��?��N��K��L��F��9��@��K)'��D�J��>��?��=��E��?��N��K��L��F��9��@�	��?�J�=	��?�J�=��?��=��E�J�=׍Q��P��B��6��?��=��E�J�=׍Q��P��B	��?�J�=	��?�J�=����;��?��1��K��E�J��>�=׍Q��C��P��D��C��K��9��K��>ٟ@���9��@��9��W��>��4��R��/ҾW��B��1��.�O��>��N����B�9��K��J��K��>��N�9͝,ڪ3��.��WȻ��B�D��E��A¶7ģC��:��Q����;��?��1��K��E�J��>�=׍Q��C��P��D��C��9��>ٟ@���9��@��9��W��>��4��R��/ҾW��B��1��.�O��>��N����B��8��J��>��N�9Ν,��.��WȻ��B�D��E��NģC��:��Q�	��?�J�=	��?�J�=20����?����>��?�J��>��,��N��1��6��6��=��=��@Ԛ<)'����?����>��?�J��>�=��1��6��=��@Ԛ<	��?�J�=	��?�J�=��D�J��>��?��=��@��K��D�J��>��?��=��@��K�	��?�J�=	��?�J�=><��D�J��>��?��=��4��F��S��CܞN��/����O��������J��-��0��E/-��D�J��>��?��=��4��F��S�N����O����J��7��E	��?�J�=	��?�J�=��?�J�=��4Н?��A��3��A��T��?�J�=��4��A��A�	��?�J�=	��?�J�=)'��V��H��D�J��>��4�=��5��D�3Ȼ;��>��T ��V��D��>��4�=��5��D�3Ȼ;��>	��?�J�=	��?�J�= ��?��E�J�=׍Q��F��K��	��A��B ��?��E�J�=׍Q��F��K��	��A��B�	��?�J�=	��?�J�=��?�J�=��Uа.��T��?�J�=��*	��?�J�=	��?�J�=)'�J��>��?�=ʡH۩R��V��-��T��.��6��.��T&$�J��>��?�=ʡH۩R��V��-��T��.��6��.�	��?�J�=	��?�J�=;9��?��E�J�=׍Q��7��E��7��0��	��NʡH��
-��H��0��6��4��T��53��?��E�J�=׍Q��,��0��	��NʡH��
-��H��0��6��4��T��	��?�J�=	��?�J�=20��U��W��X�=��6��?��K��J�J�=��3��WН?��>��A��T,*��U��W��X�=��6��?��K��J�J�=��3��W��?��A�	��?�J�=	��?�J�=/-��D��9��D��D�G��?��>�J��>��,��N��D��S�D��A#!��9��G��?��>�J��>�=��D��S�D��A	��?�J�=	��?�J�=86��4��?߸3ѝ6��B��5��-��0��I�J��?߸3�=��=��I̛<��Q��T20��4��?߸3ѝ6��B��5��0��I�J��?߸3�=��=��I̛<��Q�	��?�J�=	��?�J�=��D��/��F��;	��D��F��;	��?�J�=	��?�J�=)'��D�J��7��?��>��=����F��>��>��@��>��T#!��D�J��7��>��=����F��>��>��@��>���I��F��E��T��>��I��F��T��>_]��I��F��E��7��1��U��C��5�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��TVT��I��F��7��1��U��C�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��I��F��E��T��>��I��F��T��>&$��1��F��E��W��K��.��W�K��C��:��E��T��1��F��W��K��W�K��C��:��E���I��F��E��T��>��I��F��T��>,*��I��F��E��D��6��A��S��1��F՟?��>��>��@Ԛ<#!��I��F��D��6��Aū1��?��>��>��@Ԛ<��I��F��E��T��>��I��F��T��>;9��I��F��E��A��W̋?�6��F��F��1��U�K��>�6�2��6��:��:��@20��I��F��A̋?�6�.��1��U�K��>�6�2��6��:��:��@���I��F��E��T��>��I��F��T��>_]��I��F��E��7��1��U��C��5�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��TVT��I��F��7��1��U��C�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��I��F��E��T��>��I��F��T��>&$��I�K��M��F��E��-��>��CϨH��Q��R��T��I�K��M��F��-��CΨQ��R��T���I��F��E��T��>��I��F��T��>,*��I��F��E��D��6��A��S��1��F՟?��>��>��@Ԛ<#!��I��F��D��6��Aū1��?��>��>��@Ԛ<��I��F��E��T��>��I��F��T��>20��I��F��E��D��6��A��S��1��F՟?��>��>��D��S�D��A)'��I��F��D��6��Aū1��?��>��>��D��S�D��A���I��F��E��T��>��I��F��T��>_]��I��F��E��7��1��U��C��5�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��TVT��I��F��7��1��U��C�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��I��F��E��T��>��I��F��T��>53��F��E��>��>��M��*��ɬI��*��I��*��5��5��T��H�<��T,*��F��>��>��M��*��ɬI��*��I��*��5��T��H��T���I��F��E��T��>��I��F��T��>,*��I��F��E��D��6��A��S��1��F՟?��>��>��@Ԛ<#!��I��F��D��6��Aū1��?��>��>��@Ԛ<��I��F��E��T��>��I��F��T��>53��H����F��E��>��>��@��I��U��>��J��-��F�>��T��L��P20��H����F��>��>��@��I��U��>��J��-��F�>��T��L��P���I��F��E��T��>��I��F��T��>_]��I��F��E��7��1��U��C��5�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��TVT��I��F��7��1��U��C�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��I��F��E��T��>��I��F��T��>20��IP��=��E��>��>��F��E��D��H��>��Q��I��B��,ܔN)'��I��=��E��>��>��F��D��H��>��Q��I��B��G���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;��:��O��;��4��P��@Ԛ<��:��;��4��P��@Ԛ<���D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��;�
-��:��O��;��W��L�/��?��T�
-��:��;��W��.���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;��O��:��4��;��D��G��@��K��O��:��;��D��@��K���D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��;��:��O�D��>��;��@��K��:�D��>��;��@��K���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;��D��O��;��2	��D��;��2���D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��; ��:��O��;��2��,��L��D��G��@��K��:��;��2��,��D��@��K���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;��:��O��;��2	��:��;��2���D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��;��:��O��;��2��8�,��T��:��;��2��,��T���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;&$����:��OƔ>��;��2�1��E��T��!����!����:Ɣ>��;��2�1��E��T�����D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��;��:��O��;��2��8�,��T��:��;��2��,��T���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��; ��P��:��O��8��;��:��I̺@��:��T��P��:��8��;��:��@���D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��;#!��:��O��;��J��:��O��4��9��7��4��T��:��;��J��:��4��7��4��T���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;��D��O��;��2��:��T��D��;��2��:��T���D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��;��:��O��;��2��D��G��@��K��:��;��2��D��@��K���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;��:��O��;��2��7��C��<��B��B��:��;��2��7��C��<��B��B���D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��;��:��O��;�I��@��K��:��;�I��@��K���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;#!��:��O��KخG��5��K��;��D��G��@��K��:��KخG��5��K��D��@��K���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ?&$��/��I��M��T��S��;ͺ?ٟ@��6��A�7��B��I��T��S��;ͺ?��5��+���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ?&$��(��T��S��;��>��6��/��I��M��@��@��@��(��T��S��;��>��6��I��@��@���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ?86��/��I��M��T��R��;��>��>��V��Bͺ?�C��7�=��V��-��A��B)'��I��T��R��;��>��>��Bͺ?�C��7��V��A��B���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ?��(��T��S��6��4ͺ?��(��T��S��6��4ͺ?���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ? ��/��I��M��T��;ͺ?��D��S�D��A��I��T��;ͺ?��D��S�D��A���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ?A?��/��I��M��P�D��;��Fͺ?��M��7��K��/��1�I��-�I��-������@Ԛ<53��I��P�D��;��Fͺ?��M��K��/�I��-�I��-������@Ԛ<���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ? ��/��I��M��F̽>��S��6��>��N��B��I��F̽>��S��6��>��N��B���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ?53��;ͺ?�9��T��.��/��I��/��J��@��/��T��A��/��I��M��T,*��;ͺ?�9��T��.��I��/��J��@��/��T��A��I��T���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ?86��/��I��M̺ٟ@��6ʔ7��;��Vͺ?��2��(��/��I��M����I��@)'��I̺��5ʔ7��;��Vͺ?��2��(��I����I��@��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��8&$��U��J�G��>�S��I��B��E��U��3��H��8��U�G��>�S��I��B��8��H��8��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��8zx��7��HܞN��D�G��>�S��E��U��7��HܞN��D�G��>�S��E��U��Q��7��HܞN��D�G��>�S��E��U��D��E��T߹-��8��Lԓ6��Iַ;��C��=��.b`��7��H�N�G��>�S��8��7��H�N�G��>�S��8��Q��7��H�N�G��>�S��8��D��E��T߹-��8��Lԓ6��I��C��=��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��8ki�/��K��@�G��>�S��E��U�S��T��S��U��Q��=��W��B�S��E��U�S��I��B��E��U߹-��=��E��M��S��Iַ;��B��U��1��TPN�/��K�G��>�S��8�SŘ<��Q��=��W��B�S��8�S��I��B��8߹-��=��E̠M��I��B��U��1��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��8DB��-ܞN��D�G��>��!������)�S��E��U��Q��-����Q�;ۓR��T��C��G�0/-��-�N�G��>�S��8��Q��-����Q�;ۓR��C��G�0��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��886��R�N��U�G��>�S��E��U��I��B�S��E��U��)����:�/��B#!��N�G��>�S��8��I��B�S��8��:�/��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��8/-ܞN��D�G��>�S��I��B��E��U�;�S��I��B��E��U&$�N�G��>�S��I��B��8�;�S��I��B��8��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��8SQ��-ܞN��D�G��>�S��E��U��Q��D��2��V��Fȣ8��4�X��I��UҔB��<֗T��I�7��Iַ;ŒA��TJH��-�N�G��>�S��8��Q��D��2��V��Fȣ8��4�X��I��UҔB��<֗T��I�7��IŒA��T��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��8����P����P��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��820��D�G��I��B��E��U�S��E��U��V��;��E��U��B��E��U#!��D�G��I��B��8�S��8��V��8��B��8� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>,*��B��D��C��D��9��4��>��,��6�O��D��P�D��A&$��B��D��R��9��4��>��,��6��D��P�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>#!��B��C��D��O��9��4��>��6�O��@Ԛ<��B��R��O��9��4��>��6��@Ԛ<� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>DB��D��B��D��>��C��D��.��N��A��>��%��>��R��6��Iٟ@��9��7��D��S�D��A53��B��>��R��.��N��A��>��%��>��R��6��@��4��D��S�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>20��D��C��D�9�7��U��D��E��4��Oٟ@��6��A��A�7��B ��D��R��5��U��D��M��Oٟ@��6��+� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>,*��B��D��C��D��9��4��>��,��6�O��D��P�D��A&$��B��D��R��9��4��>��,��6��D��P�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>PN��D��B��C��D��Iٟ@��9��6��4��E��>йS��D��K�9ٟ@��9��S��M��>��B�U��-щQ��@Ԛ<><��D��B��R��@��5��4��E��>޹S @��9��S��M��>��B�U��-щQ��@Ԛ<� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>DB��D��B��D��>��C��D��.��N��A��>��%��>��R��6��Iٟ@��9��7��D��S�D��A53��B��>��R��.��N��A��>��%��>��R��6��@��4��D��S�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>PN��O��D��6��>��D��=��7��A��D��B��D��C��D��=�9�>��D��Iٟ@��O��D��2�O��@��@��@><��O��D��6��>��D��=��7��A��B��R��9�>��D��@��O��D��2�O��@��@� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>,*��B��D��C��D��9��4��>��,��6�O��D��P�D��A&$��B��D��R��9��4��>��,��6��D��P�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>b`��D��B��D��C��D��Cٟ@��9ɤK��E��7��>��RɤK�/ϪJ��>��H��=��Q��9��9��6ɤK��A��>��A�9��1��0��T��DPN��B��R��@��9ɤK��E��7��>��RɤK�/ϪJ��>��H��=��Q��9��5ɤK��A��>��A��1��0��T��D� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>DB��D��B��D��>��C��D��.��N��A��>��%��>��R��6��Iٟ@��9��7��D��S�D��A53��B��>��R��.��N��A��>��%��>��R��6��@��4��D��S�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>_]��D��B��C��D��N��5�9��O��H��3��4��8��B��D��4��R��4��O��@��4��W��OŮP��O��4�/��T��D����O��TDB��D��B��R��N��5��O��3��8��B��D��4��M��O��@��4��W��X޵+��T����O��T� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>,*��B��D��C��D��9��4��>��,��6�O��D��P�D��A&$��B��D��R��9��4��>��,��6��D��P�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>#!��B��C��D��9��4��>��A��6�O��@Ԛ<��B��R��9��4��>��A��6��@Ԛ<� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>DB��D��B��D��>��C��D��.��N��A��>��%��>��R��6��Iٟ@��9��7��D��S�D��A53��B��>��R��.��N��A��>��%��>��R��6��@��4��D��S�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��> ��U��C��D��9��4��>��A��6��?��,��U��R��9��4��>��A��6��?��,� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��PA?��B��T��S��6˩5؇9��?˩5��O��M��R��9��I����1��F��U��F��F��P��J86��B��S��6؇9��?˩5��O��M��R��9��I����1��F��U��F��P��J ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P/-��B؇9��6˩5ֲR����1��F��Q�?ٟ@��S��P��G��3&$��B؇9��6ֲR����1��F��Q�?ٟ@��S��G� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P20��U��C��>��B��0��6˩5��N��R��3��1��S��FщQ��@Ԛ</-��U��C��>��B��0��6��N��R��3��1��S��FщQ��@Ԛ< ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P#!��B��O��F��R��6˩5֛7��>��3��P��J��B��O��F��R��6�7��3��P��J� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��PA?��B��T��S��6˩5؇9��?˩5��O��M��R��9��I����1��F��U��F��F��P��J86��B��S��6؇9��?˩5��O��M��R��9��I����1��F��U��F��P��J ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��Pki��B��T��S��6˩5��0��Q��N�?�9��H��9��R��I��J��IН?��T��X��L��I��/��I��/��I��/��B��=��6��I��6��B��=��-��0YW��B��S��6��0��Q��N�?�9��H��9��R��I��J��Iܞ?ɜX��I��/��I��I��/��B��=��6��I��6��B��=��0� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P20��U��C��>��B��0��6˩5��N��R��3��1��S��FщQ��@Ԛ</-��U��C��>��B��0��6��N��R��3��1��S��FщQ��@Ԛ< ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��PDB��0�5��OȨK��F��D�9��I��V��B��T��E��LȨK��F�9��I��V��:��TН?��>/-��0�5��OӨK��D��I��V��B��E��LӨK��I��V��:��?� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��PA?��B��T��S��6˩5؇9��?˩5��O��M��R��9��I����1��F��U��F��F��P��J86��B��S��6؇9��?˩5��O��M��R��9��I����1��F��U��F��P��J ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P20��B��T��S��6˩5��0��B��T��6˩5��1��T��7��H��;��T#!��B��S��6��0��B��6��1��7��H��;��T� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P20��U��C��>��B��0��6˩5��N��R��3��1��S��FщQ��@Ԛ</-��U��C��>��B��0��6��N��R��3��1��S��FщQ��@Ԛ< ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P ��B��T��6��6˩5��0��Q��GН?��>��B��6��6��0��Q��G��?� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��PA?��B��T��S��6˩5؇9��?˩5��O��M��R��9��I����1��F��U��F��F��P��J86��B��S��6؇9��?˩5��O��M��R��9��I����1��F��U��F��P��J ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P��B�R��6˩5��1��?��F��B��T��B�R��6��1��?��B��T� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P20��U��C��>��B��0��6˩5��N��R��3��1��S��FщQ��@Ԛ</-��U��C��>��B��0��6��N��R��3��1��S��FщQ��@Ԛ< ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P ��B��T؇9��6˩5��M��5��R��F��F��B؇9��6��M��5��R��F��F���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T��.��T��G��T��6��>��?��>��P��.��T��G��6��>��?��>��P���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T��@��N��>��P��C��@��N��>��P���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T;9��G߹-��.��T��G��TޚT��>��9��B��K��R�9��KϋI��LK��A��B20��G߹-��.��T��GޚT��>��B��R�9��KϋI��LK��A��B���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T)'��.��T��G��T��6��>��7��K��M��?��U��>��T ��.��T��G��6��>��7��K��M��?��,���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T20��.��T�9��Kʉ5�5��>��A��>��B��K��=��U��;Н?��T)'��.��T�9��Kʉ5�5��>��A��>��B��U��;ܞ?���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T/-��.��T�9��Kʉ5�5��>��A��>��B��K��=��3��R��T&$��.��T�9��Kʉ5�5��>��A��>��B��3ҔR���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��TDB��O��<��>��T��R��I��O��.��T��R��I��O�V��T��I��O��B��<ȬT��I��Q��>86��O��<��>��T��R��O��.��T��R��O�V��T��I��O��B��<ЬT��Q���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��TMK��.��T��G��T��9��.��D�S��>�9��>��A��K��@P��B��@��	��A��6�O��:��@��@��@><��.��T��G��9��.��D�S��>��I��A��K��@��B��@��	��A��6��:��@��@���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T��.��T��G��T��6��>��@Ԛ<��.��T��G��6��>��@Ԛ<�̾-�,�A��J��T��0ޡ8;-�A��J��T��0�A��J��T��?��@��K�A��J��T��?��@��K̾-�,�A��J��T��0ޡ8;-�A��J��T��0;9̾-�,��4��F��E��4�A��J��T��5��4��T��?��1��W��Q̛<��7��T/-;-��4��E��4�A��J��T��5��4��T��?��W��Q̛<��7�̾-�,�A��J��T��0ޡ8;-�A��J��T��0\Z̾-��-��,̾-�,��6��.ΩW��H��8��4�4��3�A��J��T��4��Q۹/��8��5��?��1��W��K��4��0ޡ8��>��1MK̾-��-��,;-��6ΩW��H��8��4�4��3�A��J��T��4��Q۹/��8��5��?��WåK��0��>��1̾-�,�A��J��T��0ޡ8;-�A��J��T��0PN̾-��-��H��,̾-�,��6��.ΩW��4��L��5��/��B��4��W��5�H��/��O��T��A��Nя7��>��1GE̾-��-��H��,;-��6ΩW��4��L��5��/��B��4��W��5�H��/��O��T��A��N��>��1�̾-�,�A��J��T��0ޡ8;-�A��J��T��0,*��7��H��9��8��4�A��J��T��5��4��>��0ޡ8��7 ̾-�,�A��J��T��0ޡ8;-�A��J��T��0̾-��/��X��T̾-��/��X��T�̾-�,�A��J��T��0ޡ8;-�A��J��T��0	̾-��C��T	̾-��C��T̾-�,�A��J��T��0ޡ8;-�A��J��T��0,*��H��8��4�A��J��T��5��4��0ޡ8��>��1��@��K)'��H��8��4�A��J��T��5��4��0��>��1��@��K�̾-�,�A��J��T��0ޡ8;-�A��J��T��0�A��J��T��?��@��K�A��J��T��?��@��K̾-�,�A��J��T��0ޡ8;-�A��J��T��0\Z��H��,̾-�,��XΩW��8��4�A��L��T��5��4��0ޡ8��>��1��1��D��D��>��7��U��	̾-��X̾-��X��-��TMK��H��,;-��X��8��4�A��L��T��5��4��0��>��1��1��D��>��7��U��	̾-��X̾-��X�-�̾-�,�A��J��T��0ޡ8;-�A��J��T��0\Z̾-��-��,̾-�,��6��.ΩW��H��8��4�4��3�A��J��T��4��Q۹/��8��5��?��1��W��K��4��0ޡ8��>��1MK̾-��-��,;-��6ΩW��H��8��4�4��3�A��J��T��4��Q۹/��8��5��?��WåK��0��>��1̾-�,�A��J��T��0ޡ8;-�A��J��T��0JH��8�A��J��T�O��C��6̾-�,��8��4��L��5��/��T��S��:��-�1��Q��B��U��/��;86��8�A��J��T�O��C��6;-��8��4��L��5��/��S��:��-ڠ#��/�̾-�,�A��J��T��0ޡ8;-�A��J��T��0,*��7��H��9��8��4�A��J��T��5��4��>��0ޡ8��7 ̾-�,�A��J��T��0ޡ8;-�A��J��T��0/-̾-��/��/��?��8��4�A��J��T��5��T��;��U��/��T&$̾-��/��/��8��4�A��J��T��5��T��;��*�̾-�,�A��J��T��0ޡ8;-�A��J��T��0	̾-��C��T	̾-��C��T̾-�,�A��J��T��0ޡ8;-�A��J��T��0;9̾-�,��E���A��J��T����T�DɍP��M��A��:��7��.��U��/��T/-;-��E���A��J��T����T�DӍP��A��:��7��.��*�̾-�,�A��J��T��0ޡ8;-�A��J��T��0�A��J��T��?��@��K�A��J��T��?��@��K̾-�,�A��J��T��0ޡ8;-�A��J��T��0��I��L��I��L�̾-�,�A��J��T��0ޡ8;-�A��J��T��0\Z̾-��-��,̾-�,��6��.ΩW��H��8��4�4��3�A��J��T��4��Q۹/��8��5��?��1��W��K��4��0ޡ8��>��1MK̾-��-��,;-��6ΩW��H��8��4�4��3�A��J��T��4��Q۹/��8��5��?��WåK��0��>��1̾-�,�A��J��T��0ޡ8;-�A��J��T��0&$̾-��4�A��T��9��5��/��?��V��/��?��T ̾-��4�A��T��9��5��/��@��?��T�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?,*��7��F��:��B��P����1��N��D��?��F��:��@Ԛ<#!��7��:��B��P����1��N��D��?��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?��R��6������!��8�,��T��R��6����,��T�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?20��7��F��:��B��P����1��N��D��?��F��:��6��S��@Ԛ<)'��7��:��B��P����1��N��D��?��6��S��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?53��1��K��>��Q�P��?��F��:��Bб��4��D��=��3��-��A��B,*��1��K��>��Q�P��?��Bб��4��D��=��-��A��B�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?,*��7��F��:��B��P����1��N��D��?��F��:��@Ԛ<#!��7��:��B��P����1��N��D��?��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?;9��4��F��:̔6��B��U��P��V��7����1��5��C��S��?��F��:��@Ԛ<20��4��:̔6��B��U��P��V��7����1��5��C��S��?��@Ԛ<�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?20��7��F��:��B��P����1��N��D��?��F��:��6��S��@Ԛ<)'��7��:��B��P����1��N��D��?��6��S��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?&$̔6ַ;�I��B��U��Vԋ/��C��S��?��F��:��1�I��B��U��Vԋ/��C��S��?�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?,*��7��F��:��B��P����1��N��D��?��F��:��@Ԛ<#!��7��:��B��P����1��N��D��?��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?GE��F��:̔6��B��U��>ȣ8��9��0��7����1��K��K��6��S��?��F��:��D��S�D��A><��:̔6��B��U��>ȣ8��9��0��7����1��K��K��6��S��?��D��S�D��A�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?20��7��F��:��B��P����1��N��D��?��F��:��6��S��@Ԛ<)'��7��:��B��P����1��N��D��?��6��S��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��? �D��A��7�O��=P��Rߑ4��P��T�D��A��7��=��Rߑ4��P��T�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?,*��7��F��:��B��P����1��N��D��?��F��:��@Ԛ<#!��7��:��B��P����1��N��D��?��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��? ��?��F��:��6��S��>��J��<��B��B��?��6��S܃>��<��B��B�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?20��7��F��:��B��P����1��N��D��?��F��:��6��S��@Ԛ<)'��7��:��B��P����1��N��D��?��6��S��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?��?��F��:��6��S��@��K��?��6��S��@��K���U��N��D�H��F��/��U��D�H��F��/PN��U����2��Q��N��5��D�H��F��/��Bٟ@����S��K��D��N��D����S��C��>��K��2��@Ԛ<JH��U����2��Q��N��D�H��F��/��@����S��K��D��N��D����S��C��>��K��2��@Ԛ<��U��N��D�H��F��/��U��D�H��F��/#!��0�-��0��:��Nٟ@�H�F��V��F��T��0�-��:ٟ@�H��F��F��T���U��N��D�H��F��/��U��D�H��F��/GE��M��:��5��<��U��T��D��8��F��/����S��K��N��H��1��E����W��/��@��@��@;9��M��:��<��U��T��D��F����S��K��N��H��1��E����W��/��@��@��U��N��D�H��F��/��U��D�H��F��/,*��NԚ<��B��>��U��N��D��8��F��/��?��P��S��6 ��1��B��>��U��D��F��?��P��S��6���U��N��D�H��F��/��U��D�H��F��/PN��U����2��Q��N��5��D�H��F��/��Bٟ@����S��K��D��N��D����S��C��>��K��2��@Ԛ<JH��U����2��Q��N��D�H��F��/��@����S��K��D��N��D����S��C��>��K��2��@Ԛ<��U��N��D�H��F��/��U��D�H��F��//-ӟ;��N��@��R��>��8��F��S��/��"ҥ3��!��@��;��6&$ӟ;��N��R��>��8��F��S��"ҥ3��!��@��6���U��N��D�H��F��/��U��D�H��F��/GE��M��:��5��<��U��T��D��8��F��/����S��K��N��H��1��E����W��/��@��@��@;9��M��:��<��U��T��D��F����S��K��N��H��1��E����W��/��@��@��U��N��D�H��F��/��U��D�H��F��/#!��U��NۚK��/��D��8��F��D��S�D��A ��U��N��/��D��8��F��D��S�D��A���U��N��D�H��F��/��U��D�H��F��/PN��U����2��Q��N��5��D�H��F��/��Bٟ@����S��K��D��N��D����S��C��>��K��2��@Ԛ<JH��U����2��Q��N��D�H��F��/��@����S��K��D��N��D����S��C��>��K��2��@Ԛ<��U��N��D�H��F��/��U��D�H��F��/&$����F��S��5��<��U��T��=��N��@��>��/����F��S��<��Uǃ=��N��>��/���U��N��D�H��F��/��U��D�H��F��/GE��M��:��5��<��U��T��D��8��F��/����S��K��N��H��1��E����W��/��@��@��@;9��M��:��<��U��T��D��F����S��K��N��H��1��E����W��/��@��@��U��N��D�H��F��/��U��D�H��F��/)'��MƛK��U��2��Q��T��5��D�H��F��/��@Ԛ<#!��MƛK��U��2��T��D�H��F��/��@Ԛ<���U��N��D�H��F��/��U��D�H��F��/PN��U����2��Q��N��5��D�H��F��/��Bٟ@����S��K��D��N��D����S��C��>��K��2��@Ԛ<JH��U����2��Q��N��D�H��F��/��@����S��K��D��N��D����S��C��>��K��2��@Ԛ<��U��N��D�H��F��/��U��D�H��F��/><��U��N��@�9��O����F��/��P��K��Sϥ%��U��N��@��S��/�4��3��D20��U��@��O����F��/��P��K��Sϥ%��U��N��@��S��/��4���U��N��D�H��F��/��U��D�H��F��/GE��M��:��5��<��U��T��D��8��F��/����S��K��N��H��1��E����W��/��@��@��@;9��M��:��<��U��T��D��F����S��K��N��H��1��E����W��/��@��@��U��N��D�H��F��/��U��D�H��F��/��8��F��E��Nڜ>��/��@��K��8��F��E��Nܜ>��@��K�	��,ݠ.��A��,��A_]��O��7��0��C��T��,ݠ.��7��>��D�GܤK��P������0��4��T����V��A��V����0��7��>��?����Q��;��GE��O߫B��>��G��K������0��4��T����V��A��V����0��7��>��?����Q��;��	��,ݠ.��A��,��A86��,ݠ.��>��O��/��1��9��O��6��1��6��A��B��T��G��A�7��B/-��,��>��O��/��1��9��O��6��1��6��A��B��T��G��+�	��,ݠ.��A��,��A��,ݠ.��ݠ.��O��,��ݠ.��O	��,ݠ.��A��,��A��O��,ݠ.��B��:��D��G��@��K��O��,��B��D��@��K�	��,ݠ.��A��,��A_]��O��7��0��C��T��,ݠ.��7��>��D�GܤK��P������0��4��T����V��A��V����0��7��>��?����Q��;��GE��O߫B��>��G��K������0��4��T����V��A��V����0��7��>��?����Q��;��	��,ݠ.��A��,��A ��C��N��,ݠ.��Q��A��D��P�D��A��C��N��,��Q��A��D��P�D��A�	��,ݠ.��A��,��A��,ݠ.��ݠ.��O��,��ݠ.��O	��,ݠ.��A��,��A)'��Iַ;��D��N��0��C��T��,ݠ.��AщQ��@Ԛ<��Iַ;��D��N����AщQ��@Ԛ<�	��,ݠ.��A��,��A_]��O��7��0��C��T��,ݠ.��7��>��D�GܤK��P������0��4��T����V��A��V����0��7��>��?����Q��;��GE��O߫B��>��G��K������0��4��T����V��A��V����0��7��>��?����Q��;��	��,ݠ.��A��,��A,*��>��T��,ݠ.��9��A��B��A�A��4˛5��D�A��4��>��,��9��A��B��A˛5��D��An	��,ݠ.��A��,��A��,ݠ.��ݠ.��O��,��ݠ.��O	��,ݠ.��A��,��A��N��T��,ݠ.��Nĵ*�	��,ݠ.��A��,��A_]��O��7��0��C��T��,ݠ.��7��>��D�GܤK��P������0��4��T����V��A��V����0��7��>��?����Q��;��GE��O߫B��>��G��K������0��4��T����V��A��V����0��7��>��?����Q��;��	��,ݠ.��A��,��APN����X��>��T��9�;��;��>��X��>��Q��A��7��A�O��7��R��N��;��X��7��:��U��>��E�8DB����X��>��;��>��X��>��Q��A��7��A�O��7��N��;��X��7��:��U��>��E�8�	��,ݠ.��A��,��A��,ݠ.��ݠ.��O��,��ݠ.��O	��,ݠ.��A��,��AA?��O߹-��5��,ݠ.߹-��,ݠ.��:߹-��HİU��M��A��N��C�)��O��8�,��T53��O߹-��5��,߹-��,��:߹-��H��M��A��N��C�)��O��,��T���F��IֈD��:��0��D��F��I��:��0��D��IֈD��:��0��@��?��I��:��0��@��F��IֈD��:��0��D��F��I��:��0��D ��IֈD��N��0��D��:��D��G��@��K��I��N��0��D��D��@��K���F��IֈD��:��0��D��F��I��:��0��D20��D��3Ԛ<��I��F��>��IֈD��0��>��D��:��D��S�D��A#!��3��I��F��>��I��>��D��D��S�D��A��F��IֈD��:��0��D��F��I��:��0��D��IֈD��G��C��?��D��I��G��C��?��D���F��IֈD��:��0��D��F��I��:��0��D��IֈD��:��0��@��?��I��:��0��@��F��IֈD��:��0��D��F��I��:��0��D��D��:��IֈD��1��4	��D��I��1���F��IֈD��:��0��D��F��I��:��0��D20��D��3Ԛ<��I��F��>��IֈD��0��>��D��:��D��S�D��A#!��3��I��F��>��I��>��D��D��S�D��A��F��IֈD��:��0��D��F��I��:��0��D20��D��3Ԛ<��I��F��>��IֈD��0��>��D��:��D��S�D��A#!��3��I��F��>��I��>��D��D��S�D��A���F��IֈD��:��0��D��F��I��:��0��D��IֈD��:��0��@��?��I��:��0��@��F��IֈD��:��0��D��F��I��:��0��D><يR��IֈD��:��0��DيR��4��IֈD��:��0��B��IيR��4��T��C��,��>)'يR��I��:��0��D�R��I��:��0��B��I�R��C���F��IֈD��:��0��D��F��I��:��0��D20��D��3Ԛ<��I��F��>��IֈD��0��>��D��:��D��S�D��A#!��3��I��F��>��I��>��D��D��S�D��A��F��IֈD��:��0��D��F��I��:��0��D;9��IֈD��>��0��E��F��R��4��:��0��>ğCѭDӮD��:ٟ@�H��@Ԛ</-��I��>��0��E��F��M��:��0��>ɟCܮDٟ@�H��@Ԛ<���F��IֈD��:��0��D��F��I��:��0��D��IֈD��:��0��@��?��I��:��0��@��F��IֈD��:��0��D��F��I��:��0��D,*��IֈD��:��0ߢ?��D��T��7��N��7��9��U��A��T#!��I��:��0ߢ?��D��7��N��7��9��U��A���F��IֈD��:��0��D��F��I��:��0��D20��D��3Ԛ<��I��F��>��IֈD��0��>��D��:��D��S�D��A#!��3��I��F��>��I��>��D��D��S�D��A��F��IֈD��:��0��D��F��I��:��0��D><��0��IֈD��:��0��D��0��4��IֈD��:��0��B��I��0��4��T��C��,��>/-��0��I��:��0��D��0��4��I��:��0��B��I��0��4��C���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>;9��E��8��7�C��C��@��N��.��H˱U����=���F��CסE��@��@��@20��8��7�C��C��@��N��.��H˱U����=���F��C��@��@���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>GE����=��
-��>��C��@��N��7��U��0��>ٟ@��6��M��V�I��W��>��E��D��S�D��A><����=��
-��>��C��@��N��U��0��>ٟ@��6��V��=��>��E��D��S�D��A���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>A?��C��@��N����=��
-�F��>��E��Mӛ?ߤ8��>��4��F��C��@��N��@��@��@;9��C��@��N����=��
-�F��>��E��Mӛ?ߤ8��>��4��C��@��N��@��@���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB����=��
-��>��CסE��C��@��N��D��E��0��*��*ԑ4��9��A��*��/��@��@��@;9����=��
-��>��C��C��@��N��D��0��*��*ԑ4��9��A��*��/��@��@���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>b`��B˩5��5����=����>��O��*��7��C��@����N��7��C��@��ĕ6��T����F��R��/��H����F��H��4��ĕ6��TPN��B˩5��5����=����>��O��*����C��@��ĕ6��T����F��R��/��H����F��4��ĕ6��T���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>_]��6ɵO����=����>��C��@��Nð.��A��
-��>��>ٟ@�9ٟ@��D��DܢE��SܤK��A��@��CסE��SܤK��A��@Ԛ<SQ��6ɵO����=����>��C��@��Nð.��A��
-��>��>��9��DܢE��SܤK��A��@��C��SܤK��A��@Ԛ<���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>/-��7��C��@��N��7��
-��=���F��S������ÐW��7#!����
-��=���F��S������ÐW��7���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>><����=��
-�F��>��C��@��N��P��E��Mӛ?��M��>��>��Fٟ@��6��@Ԛ<;9����=��
-�F��>��C��@��N��P��E��Mӛ?��M��>��>��F��5��@Ԛ<���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>)'��C��@��N����=��
-�F��(��D��>��M��@Ԛ<)'��C��@��N����=��
-�F��(��D��>��M��@Ԛ<����H��S��/��O��N��4�/��:���H��S��O��N��4��/VT��7���H��>��S��/��O��B��4��ʡH��9ɰ5ȥ��7�/��:������į?��I����������E��L��/����������������E���H��S��/��O��N��4�/��:���H��S��O��N��4��/JHùB��L��W¶7��/��J��7���H��>��/��B��/����Wȥ��O��B��4��7��8��4�/��:ĹB��N��/��J��8��4��/����H��S��/��O��N��4�/��:���H��S��O��N��4��/,*���H��S��/��O��N��4�/��:ҁX��?��L��-��T#!���H��S��O��N��4��/ҁX��?��L�-���H��S��/��O��N��4�/��:���H��S��O��N��4��/&$���H��W��>��S��/��9��4�/��:��AƭI���H��W��>��S��9��4��/��A����H��S��/��O��N��4�/��:���H��S��O��N��4��/)'��7���H��/��B��/��>��O��N��4��7�/��:��/���H��S��/��O��N��4�/��:���H��S��O��N��4��/;9���H��>��S��/��>��4��N��O�/��:��/��/��9¶7��JùB��L��/#!����>��4��N��O��/��/��N��JĹB��/����H��S��/��O��N��4�/��:���H��S��O��N��4��/;9��7���H��>��S��/��>��O��B��4��7��B��R��:����0����A��B��B��R��:����0����A��B���H��S��/��O��N��4�/��:���H��S��O��N��4��/20�/��:��7���H��>��S��/����O��B��4����7�/��:��/��/����H��S��/��O��N��4�/��:���H��S��O��N��4��/VT��7���H��>��S��/��O��B��4��ʡH��9ɰ5ȥ��7�/��:������į?��I����������E��L��/����������������E���H��S��/��O��N��4�/��:���H��S��O��N��4��/&$���H��>��S��/��O��B��4��"����>������O��B��4��>������H��S��/��O��N��4�/��:���H��S��O��N��4��/,*���H��S��/��O��N��4�/��:ҁX��?��L��-��T#!���H��S��O��N��4��/ҁX��?��L�-���H��S��/��O��N��4�/��:���H��S��O��N��4��/PN��7���H��S��/��4��7�/��:����ȥ������Ƕ,��W¶7��/��>��;��������G��B20��/����ȥ������Ƕ,��N��/��>��;��������G����H��S��/��O��N��4�/��:���H��S��O��N��4��/)'��7���H��/��B��/��>��O��N��4��7�/��:��/���H��S��/��O��N��4�/��:���H��S��O��N��4��/kiùB��L��9¶7��/��J��7���H��S��/����9ȥ��4��N��O��7�/��:����9¶7��/�/��:����6��6ȈX��4��������&20ĹB��N��/��J��/����N��/��/����6��6ȈX��4��������H��S��/��O��N��4�/��:���H��S��O��N��4��/;9��7���H��>��S��/��>��O��B��4��7��B��R��:����0����A��B��B��R��:����0����A��B���H��S��/��O��N��4�/��:���H��S��O��N��4��/><ʡH�U٨I��7���H��S��/��4��7�/��:������:��,��A��F��>�� ʡH�U٨I��/������:��,��>������H��S��/��O��N��4�/��:���H��S��O��N��4��/VT��7���H��>��S��/��O��B��4��ʡH��9ɰ5ȥ��7�/��:������į?��I����������E��L��/����������������E���H��S��/��O��N��4�/��:���H��S��O��N��4��/��H��N�1��,��;��T��L��H�1��,��;��T��L���X��:��8��6˩5��X��:��8��6&$��X��:��8��6˩5�>��X��:��8��6˩5��7 ��X��:��8��6�>��X��:��8��6��7��X��:��8��6˩5��X��:��8��686��X��:��8��6˩5��4��X��:��8��6��4��V��D��T����(����!)'��X��:��8��6��4��X��:��8��6��V��D��T�����X��:��8��6˩5��X��:��8��6&$��X��:��8��6˩5�>��X��:��8��6˩5��7 ��X��:��8��6�>��X��:��8��6��7��X��:��8��6˩5��X��:��8��6)'��E��8��:��X��6��6˩5�H��3��8��@��@��@#!��E��8��:��X��6��6�H��3��8��@��@���X��:��8��6˩5��X��:��8��6&$��X��:��8��6˩5�>��X��:��8��6˩5��7 ��X��:��8��6�>��X��:��8��6��7��X��:��8��6˩5��X��:��8��620��X��:��8��6ӻB��O��X��:��8��6˩5��Q��4��6��4��T#!��X��:��8��0��X��:��8��6��Q��6��T���X��:��8��6˩5��X��:��8��6&$��X��:��8��6˩5�>��X��:��8��6˩5��7 ��X��:��8��6�>��X��:��8��6��7��X��:��8��6˩5��X��:��8��6GE��X��:��8��6ӻB��O��X��:��8��6ӻB��O��X��:��8��6��H��6��T��$������!20��X��:��8��0��X��:��8��6��O��X��:��8��6��6��T����X��:��8��6˩5��X��:��8��6&$��X��:��8��6˩5�>��X��:��8��6˩5��7 ��X��:��8��6�>��X��:��8��6��7��X��:��8��6˩5��X��:��8��6#!��5��X��:��8��>��6˩5��6�R��@Ԛ< ��5��X��:��8��>��6��6�R��@Ԛ<�߹-��U�.��:��D��>߹-��U�.��:��D��>_]��A��U�.��?��:��D��>��>��Rٟ@��6��U��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��.��LYW��A��U�.��?��:��D��>��>��Rٟ@��6��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��L߹-��U�.��:��D��>߹-��U�.��:��D��>߹-��U�.��8߹-��U�.��8�߹-��U�.��:��D��>߹-��U�.��:��D��>#!߹-��U�.��:��D��M��=��6��S��@Ԛ<#!߹-��U�.��:��D��M��=��6��S��@Ԛ<߹-��U�.��:��D��>߹-��U�.��:��D��>/-��U�.��>��D��P��?��1��4��:щQȻ;��T��=��.��L)'��U�.��>��D��?��1��4��:щQȻ;��T��=��L�߹-��U�.��:��D��>߹-��U�.��:��D��>ec��1��O��������.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ڤ5��5��J��@��C��;ϵ>͵A��T��J��.��P��TVT��1��O����.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ܤ5��J��@��Aϵ>͵A��T��J��.��P߹-��U�.��:��D��>߹-��U�.��:��D��>)'кB��U�.��6��:��D��P߇;Ȼ;��T��=��.��L#!кB��U�.��6��:��D߇;Ȼ;��T��=��L�߹-��U�.��:��D��>߹-��U�.��:��D��>_]��A��U�.��?��:��D��>��>��Rٟ@��6��U��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��.��LYW��A��U�.��?��:��D��>��>��Rٟ@��6��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��L߹-��U�.��:��D��>߹-��U�.��:��D��> ��U�.��8߹-��U�.��.ʺB��P��T ��U�.��8߹-��U�.��.ʺB��P��T�߹-��U�.��:��D��>߹-��U�.��:��D��>#!߹-��U�.��:��D��M��=��6��S��@Ԛ<#!߹-��U�.��:��D��M��=��6��S��@Ԛ<߹-��U�.��:��D��>߹-��U�.��:��D��> ߹-��U�.��:��/�0��E��F��T��6 ߹-��U�.��:��/�0��E��F��T��6�߹-��U�.��:��D��>߹-��U�.��:��D��>ec��1��O��������.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ڤ5��5��J��@��C��;ϵ>͵A��T��J��.��P��TVT��1��O����.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ܤ5��J��@��Aϵ>͵A��T��J��.��P߹-��U�.��:��D��>߹-��U�.��:��D��>&$��U�.����V��P����1��B��,��,��	��5&$��U�.����V��P����1��B��,��,��	��5�߹-��U�.��:��D��>߹-��U�.��:��D��>_]��A��U�.��?��:��D��>��>��Rٟ@��6��U��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��.��LYW��A��U�.��?��:��D��>��>��Rٟ@��6��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��L߹-��U�.��:��D��>߹-��U�.��:��D��>߹-��V��;��T��6��4߹-��V��;��T��6�߹-��U�.��:��D��>߹-��U�.��:��D��>#!߹-��U�.��:��D��M��=��6��S��@Ԛ<#!߹-��U�.��:��D��M��=��6��S��@Ԛ<߹-��U�.��:��D��>߹-��U�.��:��D��>MK߹-��U�.�6��>��P��5��,�A߹-��U�.˭V�6��,��3��T߹-˭V�6܈I��U��?��9�0GE߹-��U�.�6��>��P��5��9߹-��U�.˭V�6��,��3��T߹-˭V�6߈I��?��9�0�߹-��U�.��:��D��>߹-��U�.��:��D��>ec��1��O��������.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ڤ5��5��J��@��C��;ϵ>͵A��T��J��.��P��TVT��1��O����.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ܤ5��J��@��Aϵ>͵A��T��J��.��P߹-��U�.��:��D��>߹-��U�.��:��D��>MK����N����=��.��H����=���F��0��B��U�.��$��D��:����N����=��M��P��M��PJH����N����=��U����=���F��0��B��U�.��$��D��:����N����=��M��P��M��P�߹-��U�.��:��D��>߹-��U�.��:��D��>_]��A��U�.��?��:��D��>��>��Rٟ@��6��U��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��.��LYW��A��U�.��?��:��D��>��>��Rٟ@��6��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��L߹-��U�.��:��D��>߹-��U�.��:��D��>/-߹-��U�.��D��I��4��2��9��-��D��I��V��=�R��J#!߹-��U�.��D��4��2��9��-��+�R��J�߹-��U�.��:��D��>߹-��U�.��:��D��>#!߹-��U�.��:��D��M��=��6��S��@Ԛ<#!߹-��U�.��:��D��M��=��6��S��@Ԛ<߹-��U�.��:��D��>߹-��U�.��:��D��>53߹-��U�.��/�0��Bб��D��D�7��=��E��U��T۹/��U��D,*߹-��U�.��/�0��Bб��D��D�7��,��U��/��D�߹-��U�.��:��D��>߹-��U�.��:��D��>ec��1��O��������.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ڤ5��5��J��@��C��;ϵ>͵A��T��J��.��P��TVT��1��O����.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ܤ5��J��@��Aϵ>͵A��T��J��.��P߹-��U�.��:��D��>߹-��U�.��:��D��>;9߹-��U�.��0��N��U��Oބ2��E������	����	��=ĪC��'��A��B&$߹-��U�.��0��N��U��OǷ.����=��A��B�߹-��U�.��:��D��>߹-��U�.��:��D��>_]��A��U�.��?��:��D��>��>��Rٟ@��6��U��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��.��LYW��A��U�.��?��:��D��>��>��Rٟ@��6��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��L߹-��U�.��:��D��>߹-��U�.��:��D��>SQ��:��Aб��=����>��U�.��=��9��V��>��D��>�9Ԛ<��O��I��SÄN��2��6��8�9��F��T��6GE��:��Aб��=����>��U�.��V��>��D��>��1��O��I��SÄN��9��8�9��F��T��6�߹-��U�.��:��D��>߹-��U�.��:��D��>#!߹-��U�.��:��D��M��=��6��S��@Ԛ<#!߹-��U�.��:��D��M��=��6��S��@Ԛ<߹-��U�.��:��D��>߹-��U�.��:��D��>)'��U�.��>��D��P��6��:��,Ȼ;��T��=��.��L#!��U�.��>��D��6��:��,Ȼ;��T��=��L�߹-��U�.��:��D��>߹-��U�.��:��D��>ec��1��O��������.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ڤ5��5��J��@��C��;ϵ>͵A��T��J��.��P��TVT��1��O����.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ܤ5��J��@��Aϵ>͵A��T��J��.��P߹-��U�.��:��D��>߹-��U�.��:��D��> ߹-��1��U�.��9��T��D��S�D��A ߹-��1��U�.��9��T��D��S�D��A�߹-��U�.��:��D��>߹-��U�.��:��D��>_]��A��U�.��?��:��D��>��>��Rٟ@��6��U��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��.��LYW��A��U�.��?��:��D��>��>��Rٟ@��6��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��L߹-��U�.��:��D��>߹-��U�.��:��D��>GE��3Ԛ<��U�.��D��Pڶ>��9��V��C��=��6��R��M��K��
-��P��T��6��1��T��P��653��1��U�.��D��9��V��C��6��R��K��
-��P��T��6��1��TڀP�߹-��U�.��:��D��>߹-��U�.��:��D��>#!߹-��U�.��:��D��M��=��6��S��@Ԛ<#!߹-��U�.��:��D��M��=��6��S��@Ԛ<߹-��U�.��:��D��>߹-��U�.��:��D��>;9߹-��U�.��:��S��M��?��B��;��BɵO��M��S��B��#��*��*��.��T53߹-��U�.��:��S��M��?��B��;��BֵO��S��B��#��*��*��.�	��U�8��S��U��S/-��1۠N�
-��FɹK��U��=��S��5ۓR��:ϡS��F��A��T#!ޠN�
-��F��U��=��S��5ۓR��:ݡS��A	��U�8��S��U��S&$��F��M��G��M��M��>��.��3ˠS�8��7��T��F��M��G��M��M��>��.�8��7�	��U�8��S��U��SA?����N��,ˏR��0��#��>ˌD��3��U��=��S��NۥN����������&����7><����N��,ˏR��0��#��>ьD��U��=��S��NۥN����������&����7	��U�8��S��U��S)'��V��X��?��A��MP��S��>��S��M�8��G��J#!��V��X��?��A��MP��S��>��SٶM��1�	��U�8��S��U��S/-��1۠N�
-��FɹK��U��=��S��5ۓR��:ϡS��F��A��T#!ޠN�
-��F��U��=��S��5ۓR��:ݡS��A	��U�8��S��U��S/-��U��=��Sб�.��6��5��J��?��O��4ʄ/��&�8��7&$��U��=��Sб�.��6�J��O��4ʄ/��&��8�	��U�8��S��U��SA?����N��,ˏR��0��#��>ˌD��3��U��=��S��NۥN����������&����7><����N��,ˏR��0��#��>ьD��U��=��S��NۥN����������&����7	��U�8��S��U��SVT��7��0�:��7�K���U�6��A�8��>��C������¾9�8��T��P��7P��X��>¾9�8��7��;><�K���U�6��A��>��C���¾9��8��P��7P��X��>¾9��8��;�	��U�8��S��U��S/-��1۠N�
-��FɹK��U��=��S��5ۓR��:ϡS��F��A��T#!ޠN�
-��F��U��=��S��5ۓR��:ݡS��A	��U�8��S��U��SA?��U��=��Sб����7̛<�8��7��E��7��C��7����7̛<��(��������!53��U��=��Sб����7̛<�8��7��E��7��C��7����7̛<���	��U�8��S��U��SA?����N��,ˏR��0��#��>ˌD��3��U��=��S��NۥN����������&����7><����N��,ˏR��0��#��>ьD��U��=��S��NۥN����������&����7	��U�8��S��U��S86��W��7��I��U�8��>��S��E��Sٟ@�M߫U��@��U��'��@��@��@20��W��7��I��U��>��S��E��Sٟ@�M߫U��@��U��'��@��@�	��U�8��S��U��S/-��1۠N�
-��FɹK��U��=��S��5ۓR��:ϡS��F��A��T#!ޠN�
-��F��U��=��S��5ۓR��:ݡS��A	��U�8��S��U��S><�����������������
-��4��J��6����N��L�F��;��8��T��786�����������������
-��4��J��6����N��L�F��8��7�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.;9��/�,��<��7��F����N��C��N��:��Q��E��I��/��4�O�5��.��L,*��/��<��7��F����N��N��:��+��@��4�O�5��L�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.b`��/��Q��E��T����N��/��Q��E��V��K��/��Q��E��L��9��O��/Լ=��E��T��/��Q��E��/��4��/��V��Q��E��1��WJH��/��+��T����N��/��+��V��/��+��L��9��O��/�=��T��/��+��/��/��V��7��1��W�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.&$����N��C��N��/��Q��E��L��<��W��Q��T����N��N��/��+��L��<��W��Q�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.86����N��C��N��/�,��7ΩWǔ:��Q��B��1�5�O��.��Q��E��T,*����N��N��/��7ǔ:��Q��B��1�5�O��.��+��T�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.������N��/��Q��E��/��4��/��Q��/��4��K��/��Q��E��/��4��H��/��Q��E��Q��W��J��/��Q��E��E��/��V��/��Q��/��4��8��/��Q�O��4��/��Q��E��/��/��Q��/��K����-qo����N��/��+��/��/��Q��/��K��/��+��/��H��/��+��Q��W��/��+��E��V��/��Q��/��8��/��6��4��/��7��/��/��Q��/��K����-�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��./-����N��/��X�O��6��E��/��4�O�5��L��Q��E��@/-����N��/��X�O��6��E��/��4�O�5��L��Q��E��@�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.;9��2����N��Q��E��/�O��O�5��2��Q�O��K��2��Q��O��K��"��W53��2����N��+��/�O��O�5��2��6��K��2��Q��O��K��"��W�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.�~����N��/�,��K�O��6��1ؙD��T�5�O��L��/��Q��:��L��E��G��:��9��/��4��E��.��E��J��Q��E��T����N��D��Q��E��N��K��7��9��1��W��Tec����N��/��K�O��6��1ؙD��T�5�O��L��/��Q��:��E��:��/��E��.��E��J��+��T����N۳9��E��N��7��9��1��W�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.&$����N��/��1��Q��I��/��4�O��.��D��Q ����N��/��1��Q��@��4�O��.۳9�	��@��U��E	��@��U��E��U��E��T��@��?	��U��E��@	��@��U��E	��@��U��E ��@��U��F��5��E�9��:��U��@Ԛ<��@��U��F��5��E��:��U��@Ԛ<���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>)'��U��C��9��S��;ٟ@��>��6��E��6��>��@Ԛ<)'��U��C��9��S��;ٟ@��>��6��E��6��>��@Ԛ<���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>&$��V��X��,��)��E��Bٟ@��&��EϜV��Q��T��V��X��)��E��@��&��EϜV��Q���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>DB��DԚ<��(��������!ٟ@��6��E��S��>��)����%������"��6��"����&)'��D��5��E��S��>��)����%������"��6��"���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>#!��;ښL��)��E��6��?����?��O��K��T��;ښL��)��E��6����?��A��T���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>&$��8��V��1��)ٟ@��>��6��E��6��>��@Ԛ<#!��8��1��)ٟ@��>��6��E��6��>��@Ԛ<���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>53��K��:��S��;ٟ@��Sٟ@��>��6��E��6��6��>��G��A�7��B/-��K��:��S��;ٟ@��Sٟ@��>��6��E��6��6��>��G��+���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>\Z��$��B��)��K��F��E��6��>��R��D�I��6��P��G��H��>��R��5��K��9��>��6��6��;��N��D��S��PԮK߀3VT��$��B��)��K��F��E��6��>��R��D�I��6��P��G��H��>��R��K��9��>��6��6��;��N��D��S��PٮK���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>/-��RPٟ@��)����%ٟ@��6��E��6��>��D��P�D��A&$��RPٟ@��)��5��E��6��>��D��P�D��A���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>GE��!��Rٟ@��6��E��6��>ٟ@��щQ��K��B��)��B��$����&��9��U��>щQ��@Ԛ<;9��!��R��5��E��6��>ٟ@��щQ��K��B��)��B����U��>щQ��@Ԛ<���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$hf��$��>��I��?��9��T��W��O��$�8��$��>��I��?��9��T��W��O��$�8��Q��$��>��I��?��9��T��W��O��$�8��,��9��PMK��$��>ɞ9��W��O��$�8��$��>ɞ9��W��O��$�8��Q��$��>ɞ9��W��O��$�8��,��9��P���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$)'��֥>��$�8��?��9��T��W�8��Q��H��.��T#!��֥>��$�8ɞ9��W�8��Q��H��.��T���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$)'��U��"����҈����$��4��T޲F��?��9��T��U��"����4��T޲Fɞ9���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$tr�L��:��V��1��T��>��B��;��W�8������׫B�!��U��H��?��I��?��9��T��$�8��C��W��O��?��9��8��W�8ɳQ��W��Q��B��H��O_]�L��:��V��1��T��>��;��W�8������׫B�!��U��H��?��Iɞ9��$�8��C��W��OǞ9��W�8ɳQ��WвQ��H���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$;9µ��$��?��9��Tµ��$��?��9��T�@��M��@��>��K��T��@�/��Bɞ9ɞ9�@ܱM��>��K��@�/���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$86��<��?��9��T��W�8��$��C��Q��-����Q�;ۓR��T��C��G�0/-��<ɞ9��W�8��$��C��Q��-����Q�;ۓR��C��G�0���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$��?��9��T��$�8��:ɞ9��$�8��:���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$86��?��9��T��$�8��?��9��T��$�8��,��?��9��T��$�8�/��P&$ɞ9��$�8ɞ9��$�8��,ɞ9��$�8�/��P���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$nl��4��T޲F��?��9��T��"����҈����$��A�/��B��4��T޲F��?��9��T��"����҈����$��Q��8ޚT��N��G��K��T��O��T><��4��T޲Fɞ9��"����A�/��4��T޲Fɞ9��"����Q��8��+��K��T��O���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$&$��?��9��T��Q��0��"ǉ:��?��9��TɳQ��Qɞ9��Q��0��"ǉ:ɞ9ɳQ��Q���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$ec����?��9��T��8�8��I��?��9��T��$�8��W��O��8�8��8��O��?��9�8��Q��D��2��C��I��0�C��9��8��>ŒA��TSQ��ɞ9��8�8��Iɞ9��$�8��W��O��8�8��8��O��?��9�8��Q��D��2��C��0�C��9��>ŒA��T���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$20��?��9��T��$�8ܞN��D֥>��W��8ݶ;��U��W��8��9��T#!ɞ9��$�8�N֥>��W΀8��U��8��9��T���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$,*��"����҈����$��4��T޲F��?��9��Tǉ:��"��"����4��T޲Fɞ9ǉ:��"���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$53��"����҈����$��4��T޲F��?��9��T��Q��D��2��D��T#!��"����4��T޲Fɞ9��Q��D��2��D��T���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$A?µ��$��?��9��T��A��=��U����L����E��Q��?Ǳ.��<��?��9��T��C��9/-ɞ9��A��=��U����L����E��Q��?Ǳ.��<ɞ9��C��9���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$)'����?��9��T��$�8����1��Ƨ!��"��K��0 ��ɞ9��$�8����1��'��"��K��0���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$_]��W�8��?��9��T��W�8��"��?��9��T��W�8��$���)��?��9��T��W�8��5��6��U��C��7��C��R��?��7��?A?��W�8ɞ9��W�8��"ɞ9��W�8ɞ9��W�8��5��6��U��C��C��R��?��7��?���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$GE��"ǉ:��?��9��T��W�8��5��6��U��$��Ȓ ��������Ƨ!��G��8��O��<��T,*��"ǉ:ɞ9��W�8��5��6��U����G��8��O��<��T���/��N��/��4��0��O�;	��N��0աO����/��N��/��4ʅ>߰4�>���N��.��X��8��B��L��F��J��O�;��5��O��Rܠ9ݩ5�N��.��F��J��O��:ݩ5ʅ>߰4�>��O��X��F��J��O��Rܠ9љ5�5��K��U��S̛<��O�;ۤ3��:��U��S̛<��Q�0��O��CP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U˩5��O��Rܠ9ݩ5�J����U��S̛<����N��4���N��X��8��B��L��FաO��5��O��Mݩ5�N��F��O��:ݩ5��4��O��X��F��O��Mљ5��5��U��S̛<աO�3��U��S̛<��Q�0�OP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U��O��Mߩ5����U��S̛<��/��N��/��4��0��O�;	��N��0աOMK��/��N��/��4ʅ>߰4�>���N��.��O��X��,��F��J��O��:��9��/��N��/��4��@��@��@,*��N��4���N��O��X��,��F��O��:��9��N��@��@���/��N��/��4��0��O�;	��N��0աO����/��N��/��4ʅ>߰4�>���N��.��X��8��B��L��F��J��O�;��5��O��Rܠ9ݩ5�N��.��F��J��O��:ݩ5ʅ>߰4�>��O��X��F��J��O��Rܠ9љ5�5��K��U��S̛<��O�;ۤ3��:��U��S̛<��Q�0��O��CP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U˩5��O��Rܠ9ݩ5�J����U��S̛<����N��4���N��X��8��B��L��FաO��5��O��Mݩ5�N��F��O��:ݩ5��4��O��X��F��O��Mљ5��5��U��S̛<աO�3��U��S̛<��Q�0�OP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U��O��Mߩ5����U��S̛<��/��N��/��4��0��O�;	��N��0աO><��/��N��/��4ʅ>߰4�>�N��.��Xҥ3߫U��B��W��O��F��J��U��Q��J&$��N��4�N��X��U��B��W��O��F��U��Q��J���/��N��/��4��0��O�;	��N��0աO����/��N��/��4ʅ>߰4�>���N��.��X��8��B��L��F��J��O�;��5��O��Rܠ9ݩ5�N��.��F��J��O��:ݩ5ʅ>߰4�>��O��X��F��J��O��Rܠ9љ5�5��K��U��S̛<��O�;ۤ3��:��U��S̛<��Q�0��O��CP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U˩5��O��Rܠ9ݩ5�J����U��S̛<����N��4���N��X��8��B��L��FաO��5��O��Mݩ5�N��F��O��:ݩ5��4��O��X��F��O��Mљ5��5��U��S̛<աO�3��U��S̛<��Q�0�OP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U��O��Mߩ5����U��S̛<��/��N��/��4��0��O�;	��N��0աO����6��Mӛ?��6��Mӛ?��O�;��O��/��N��/��4��7��>��6��7��4��4��B����9��H��S����1����HŞ��1���K��į?��D��ߋ5����G��ބ2��4����P��K��ۥN��ɿC�R����S����2ބ2��B��@����Bބ2ͩ-��	ۥN������B�D��B���M��/����N��7����<��Q��B������J��7����1��R��6��Mӛ?گD��4��ɭ4��:��N��7����1������ӛ?��9��:������6��Mӛ?��6��Mӛ?աO��O��N��̻4��B����9��S����1����HŞ��1���K��į?��D��ߋ5����G��ބ2��4����P��K��ۥN��ۿC����S����2ބ2��B��@����B��D��	ۥN������B�D��B���M��/����N��7����<��Q��B������J��7����1��R��6��Mӛ?گD��4��ɭ4��:��N��7����1������ӛ?��9��:�����/��N��/��4��0��O�;	��N��0աO����/��N��/��4ʅ>߰4�>���N��.��X��8��B��L��F��J��O�;��5��O��Rܠ9ݩ5�N��.��F��J��O��:ݩ5ʅ>߰4�>��O��X��F��J��O��Rܠ9љ5�5��K��U��S̛<��O�;ۤ3��:��U��S̛<��Q�0��O��CP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U˩5��O��Rܠ9ݩ5�J����U��S̛<����N��4���N��X��8��B��L��FաO��5��O��Mݩ5�N��F��O��:ݩ5��4��O��X��F��O��Mљ5��5��U��S̛<աO�3��U��S̛<��Q�0�OP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U��O��Mߩ5����U��S̛<��/��N��/��4��0��O�;	��N��0աO86��/��N��/��4ʅ>߰4�>�N��.��X��W��B��O��F��J��U��Q��J#!��N��4�N��X��W��B��O��F��U��Q��J���/��N��/��4��0��O�;	��N��0աO����/��N��/��4ʅ>߰4�>���N��.��X��8��B��L��F��J��O�;��5��O��Rܠ9ݩ5�N��.��F��J��O��:ݩ5ʅ>߰4�>��O��X��F��J��O��Rܠ9љ5�5��K��U��S̛<��O�;ۤ3��:��U��S̛<��Q�0��O��CP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U˩5��O��Rܠ9ݩ5�J����U��S̛<����N��4���N��X��8��B��L��FաO��5��O��Mݩ5�N��F��O��:ݩ5��4��O��X��F��O��Mљ5��5��U��S̛<աO�3��U��S̛<��Q�0�OP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U��O��Mߩ5����U��S̛<��/��N��/��4��0��O�;	��N��0աOSQ��4��/��N��/��4��5��F��>��J��F��J��Iݩ5��O��Rܠ9��4��/��N��/��4��5��F��>��J��F��J,*��4��N��5��4��F��Iݩ5��O��M��4��N��5��4��F���/��N��/��4��0��O�;	��N��0աO����/��N��/��4ʅ>߰4�>���N��.��X��8��B��L��F��J��O�;��5��O��Rܠ9ݩ5�N��.��F��J��O��:ݩ5ʅ>߰4�>��O��X��F��J��O��Rܠ9љ5�5��K��U��S̛<��O�;ۤ3��:��U��S̛<��Q�0��O��CP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U˩5��O��Rܠ9ݩ5�J����U��S̛<����N��4���N��X��8��B��L��FաO��5��O��Mݩ5�N��F��O��:ݩ5��4��O��X��F��O��Mљ5��5��U��S̛<աO�3��U��S̛<��Q�0�OP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U��O��Mߩ5����U��S̛<��/��N��/��4��0��O�;	��N��0աO ��0��0��,��B��4��.��Iַ;��@��?��0��,��B��4��.��I��@���/��N��/��4��0��O�;	��N��0աO����/��N��/��4ʅ>߰4�>���N��.��X��8��B��L��F��J��O�;��5��O��Rܠ9ݩ5�N��.��F��J��O��:ݩ5ʅ>߰4�>��O��X��F��J��O��Rܠ9љ5�5��K��U��S̛<��O�;ۤ3��:��U��S̛<��Q�0��O��CP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U˩5��O��Rܠ9ݩ5�J����U��S̛<����N��4���N��X��8��B��L��FաO��5��O��Mݩ5�N��F��O��:ݩ5��4��O��X��F��O��Mљ5��5��U��S̛<աO�3��U��S̛<��Q�0�OP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U��O��Mߩ5����U��S̛<��/��N��/��4��0��O�;	��N��0աO&$����9��:��9��;��2P��X��>��9��:��;#!����9��:��9��;P��X��>��9��:��;���,��6��B��J��>��P��7��B��J��>��P><��7��6��N��J��F��3��P��;�7��N��@��N��;��J��T��;��J��Q��;��J/-��7��N��J��F��3��P��;�7��N��N��;��T��;��Q��;��,��6��B��J��>��P��7��B��J��>��P)'��7��6��B��J��P��T��;��<̖@��@��T��M��L&$��7��B��J��P��T��;��<̖@��@��T��M��L���,��6��B��J��>��P��7��B��J��>��P/-��1��7��6��N��Jǭ;�J��2��=��>��P��Q��@��@��@)'��1��7��N��Jǭ;�J��2��=��>��P��Q��@��@��,��6��B��J��>��P��7��B��J��>��P,*��C�F��7��6��B��JԿ7��;˨O��O��/��J��Iַ;&$��C�F��7��B��JԿ7��;˨O��O��/��J��I���,��6��B��J��>��P��7��B��J��>��P��U�F��J��B��7��6͎?��/��U�F��J��B��7͎?��/��,��6��B��J��>��P��7��B��J��>��P��P��H��I��L��2��C����O��JP��H��I��L��2��C����O��JسS��B��6��B��J��Dʿ7��E��>��PP��H��I��L��2��C����O��JP��H��I��L��2��C����O��J��Q��0��N��>��>��K��J��N����B��I��L��2��C����O��J��B��I��L��2��C����O��J۳S��6��B��JϿ7��E��>��P��B��I��L��2��C����O��J��B��I��L��2��C����O��J��Q��0��N׎>��KɏJ���,��6��B��J��>��P��7��B��J��>��P&$��7��6��B��J��>��P��P��/��M�G��Q��T ��7��B��J��>��P��P��M�G��Q��T��,��6��B��J��>��P��7��B��J��>��P,*��J��R��J�C��J��D��6��P��V��.��6��;��J��T)'��J��R��J�C��J��D��6��P��V��.��6��;��T���,��6��B��J��>��P��7��B��J��>��P ��6��B��J��Dʿ7��E��>��P��@Ԛ<��6��B��JϿ7��E��>��P��@Ԛ<��,��6��B��J��>��P��7��B��J��>��P)'��A��R��J��B��J��D��6��PщQ��U��;�7��P&$��A��R��J��B��J��D��6��PщQ��U��;��7���,��6��B��J��>��P��7��B��J��>��P/-��7��6��B��J��F��6�F��,��Q��V��M�G��.��D��6,*��7��B��J��F��6�F��,��Q��V��M�G��.��D��6��,��6��B��J��>��P��7��B��J��>��P53��BܥN��F��C��S��7��B��7��6��B��R��6��H��J��>��A��P/-��BܥN��F��C��S��7��B��7��B��7��H��J��>��A��P���,��6��B��J��>��P��7��B��J��>��P><��7��6��N��J��F��3��P��;�7��N��@��N��;��J��T��;��J��Q��;��J/-��7��N��J��F��3��P��;�7��N��N��;��T��;��Q��;��,��6��B��J��>��P��7��B��J��>��PYW��Jǭ;��N��,��6��>��P��Jǭ;��DƂGщQ��Jǭ;��D�@щQ��,��6��>��G��3��.ٟ@��DƂGщQ��@Ԛ<SQ��Jǭ;��N��7��>��P��Jǭ;��DƂGщQ��Jǭ;��D�@щQ��7��>��G��3��.ٟ@��DƂGщQ��@Ԛ<���,��6��B��J��>��P��7��B��J��>��P/-��1��7��6��N��Jǭ;�J��2��=��>��P��Q��@��@��@)'��1��7��N��Jǭ;�J��2��=��>��P��Q��@��@��,��6��B��J��>��P��7��B��J��>��P/-��F��Jō/��N��J��D��0��P��L�3��6��>��;��G��B&$��J��N��J��D��0��P��L�3��6��>��;��G���,��6��B��J��>��P��7��B��J��>��P��U�F��J��B��7��6͎?��/��U�F��J��B��7͎?��/��,��6��B��J��>��P��7��B��J��>��P#!��6ǭ;��>��Q��6��N��J��>��P��;�7 ��6��>��Q��6��N��J��>��P��;�7���,��6��B��J��>��P��7��B��J��>��P&$��7��6��B��J��>��P��P��/��M�G��Q��T ��7��B��J��>��P��P��M�G��Q��T��,��6��B��J��>��P��7��B��J��>��P��C�F��J��B��6ǭ;��@Ԛ<��C�F��J��B��6��@Ԛ<���,��6��B��J��>��P��7��B��J��>��P ��6��B��J��Dʿ7��E��>��P��@Ԛ<��6��B��JϿ7��E��>��P��@Ԛ<��,��6��B��J��>��P��7��B��J��>��P ��6��B��J��D��6��E��>��P��@Ԛ< ��6��B��J��D��6��E��>��P��@Ԛ<���,��6��B��J��>��P��7��B��J��>��P/-��7��6��B��J��F��6�F��,��Q��V��M�G��.��D��6,*��7��B��J��F��6�F��,��Q��V��M�G��.��D��6��,��6��B��J��>��P��7��B��J��>��P ��7��6��B��J��>��P��Hڶ>��@Ԛ<��7��B��J��>��P��Hڶ>��@Ԛ<���,��6��B��J��>��P��7��B��J��>��P><��7��6��N��J��F��3��P��;�7��N��@��N��;��J��T��;��J��Q��;��J/-��7��N��J��F��3��P��;�7��N��N��;��T��;��Q��;��,��6��B��J��>��P��7��B��J��>��P ��Lǭ;��BϨH��J��>��P��A�7��B��L��BϨH��J��>��P��+���,��6��B��J��>��P��7��B��J��>��P/-��1��7��6��N��Jǭ;�J��2��=��>��P��Q��@��@��@)'��1��7��N��Jǭ;�J��2��=��>��P��Q��@��@��,��6��B��J��>��P��7��B��J��>��P#!��Lǭ;��BϨH��J��>��P��D��G��@��K��L��BϨH��J��>��P��D��@��K���,��6��B��J��>��P��7��B��J��>��P��U�F��J��B��7��6͎?��/��U�F��J��B��7͎?��/��,��6��B��J��>��P��7��B��J��>��P��;��-��M��=��;��-��M��=���,��6��B��J��>��P��7��B��J��>��P&$��7��6��B��J��>��P��P��/��M�G��Q��T ��7��B��J��>��P��P��M�G��Q��T��,��6��B��J��>��P��7��B��J��>��P��;��-��M��=��;��-��M��=���,��6��B��J��>��P��7��B��J��>��P ��6��B��J��Dʿ7��E��>��P��@Ԛ<��6��B��JϿ7��E��>��P��@Ԛ<��,��6��B��J��>��P��7��B��J��>��P/-ϨH��Jō/��B��J��>��PϨH��Jō/��B��Jڶ>��F��=/-ϨH��Jō/��B��J��>��PϨH��Jō/��B��Jڶ>��F��=���,��6��B��J��>��P��7��B��J��>��P/-��7��6��B��J��F��6�F��,��Q��V��M�G��.��D��6,*��7��B��J��F��6�F��,��Q��V��M�G��.��D��6��,��6��B��J��>��P��7��B��J��>��P20��7��6��B��C��J��>��P��/��G��=��Q��>��B��D��>ÐW,*��7��B��C��J��>��P��G��=��Q��>��B��D��>ÐW���,��6��B��J��>��P��7��B��J��>��P><��7��6��N��J��F��3��P��;�7��N��@��N��;��J��T��;��J��Q��;��J/-��7��N��J��F��3��P��;�7��N��N��;��T��;��Q��;��,��6��B��J��>��P��7��B��J��>��P ��Lǭ;��BϨH��J��>��P��:ÐW��4��L��BϨH��J��>��P��:ÐW��4���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��> ��<��6��>��7��T��<��@��9��:��T��<��6��>��7��T��?��9��:���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��> ��M��4��T��C��T��7��@��<��@Ԛ<��M��4��C��T��7��@��@Ԛ<���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��>53��9��T��B��@��>��T�K��7��<��:��7��@��<ǭ;��?��A��B,*��9��T��B��>��T�K��7��<��:��7��@ՄN��A��B���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��>A?��>��T��<��@��>��/��2��6��S��C��S��E��T��<��@��>��-��/��7��B��6;9��>��T��?��>��/��2��6��S��C��S��E��T��?��>��-��/��7��B��6���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��>/-��T��@��<��T��@��<��/��T��@��<��6��S��E��A��T&$��T��@��T��@��/��T��@��6��S��E��A��T���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��>,*��4��T��<��@��H��A��V��T��J��D��8��D��A��P��4��?��H��A��V��T��D��8��A���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��>��4��T��<�G��D��G��@��K��4��<�G��D��@��K���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��>;9��4��T��R��F��7��@��<��5��@��2��D��0��O����6��P����6��T,*��4��R��I��@��5��@��2��0��O����6��P����6���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��>,*��C��>��8��T��<��7��@��<��1��>��D��P�D��A&$��C��>��8��<��7��@��1��>��D��P�D��A���6��N��B��U��C��6�O	��@��U��6��6��N��B��V��1��U��C��6�O��@��V��1��U��6��6��N��B��U��C��6�O	��@��U��686��6��N��B��U��C��-�9Ԛ<��D��/щQ��6��C��U��,��C��<��P)'��@��U��-��1��D��/щQ��6��U��,��C��<��P���6��N��B��U��C��6�O	��@��U��6��6��N��B��U��C��D��P�D��A��@��U��D��P�D��A��6��N��B��U��C��6�O	��@��U��6#!��6��N��B��E��E��U�DщQ��C�D��P��@��E��E��U�DщQ��C�D��P���6��N��B��U��C��6�O	��@��U��6��6��N��B��V��1��U��C��6�O��@��V��1��U��6��6��N��B��U��C��6�O	��@��U��6��6��N��B��U��C��@Ԛ<��@��U��@Ԛ<���6��N��B��U��C��6�O	��@��U��6��6��N��B��U��C��D��P�D��A��@��U��D��P�D��A��6��N��B��U��C��6�O	��@��U��6PN��6��N��B��U��Cٟ@�9ٟ@��N��D��.��B��2�I�O��=��.��@��D��N��B��2��B��E��1�S;9��@��U��9��N��D��.��2�I�O��=��.��@��D��N��2��B��E��1�S���6��N��B��U��C��6�O	��@��U��6��6��N��B��V��1��U��C��6�O��@��V��1��U��6��6��N��B��U��C��6�O	��@��U��6&$��6��N��B��U��MP�C��.��6��D��@Ԛ< ��@��U��MP�C��.��6��D��@Ԛ<���6��N��B��U��C��6�O	��@��U��6��6��N��B��U��C��D��P�D��A��@��U��D��P�D��A��6��N��B��U��C��6�O	��@��U��6&$��6��N��B��U��Uӛ?��C��D��T��D�A��4��@��U��Uӛ?��C��D��T��D��A���6��N��B��U��C��6�O	��@��U��6��6��N��B��V��1��U��C��6�O��@��V��1��U��6��6��N��B��U��C��6�O	��@��U��620��6��N��B��U��C��-�9Ԛ<��6�Oݠ.��D����N��@Ԛ<#!��@��U��-��1��6ݠ.��D����N��@Ԛ<���6��N��B��U��C��6�O	��@��U��6��6��N��B��U��C��D��P�D��A��@��U��D��P�D��A��6��N��B��U��C��6�O	��@��U��6/-��6��N��B��6��O��U��C��N��3��>��E��T��B��E��T&$��@��6��O��C��N��3��>��E��T��B��E��T�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9/-��D��H��L��K��D?�O��A��O��6�:��,��A�7��B)'��D��H��L��K��D?�O��A��O��6�:��,��+�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��986��H��:��!��DƇ>�O��-��8�W��H�O��W��K��-��4��=�R��J53��H��:��!��DƇ>�O��-��8�W��H�O��W��K��-��4�R��J�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��986��N��A��9��=��H��5��D?�O��Jٟ@��6�:��G��2��@��@��@,*��N��9��H��5��D?�O��J��5�:��G��2��@��@�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9SQ��Dǭ;��D��Q��D�U��H��:��D�O��>��&��D�B��7��D�O��O��J��D��I��P��A��F��E��>��6MK��Dǭ;��D��Q��D�U��H��:��D�O��>��&��D�B��7��D�O��O��J��D��P��A��F��>��6�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9&$��H��=��Dć?�O��=�9��=ϷA��H����@��H��=��Dć?�O��9��A����@�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9A?��,�O��-��H��D��Bٟ@��;��?��=��1��P��K��@?�O��=�9��=��@Ԛ<53��,�O��-��H��D��@�?��=��1��P��K��@?�O��9��@Ԛ<�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9JH��H��=��W��K��=��:��B��:��D?�O��:��D��1��=��@�9��=��D��9��D��5��@Ԛ<><��H��=��W��K��=��:��B��:��D?�O��:��1��=��@��=��9��5��@Ԛ<�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9&$��H��=��Dć?�O��=�9��=��D��S�D��A ��H��=��Dć?�O��9��D��S�D��A�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9><��D��H��K��L��9�GϪJ��D?�O��=��D�?��/ٟ@��=��@��6��@Ԛ<;9��D��H��K��LݲLϪJ��D?�O��=��D�?��/ٟ@��=��@��6��@Ԛ<�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��120��H��,��8��2�,ֈ;��0��4��V��C��7��G��/��T��>��1)'��H��,��8��2ڈ;��4��V��Cî7��/��T��>��1�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1,*��V��@��,��1��V��2�,��7��C��7��G��.��V��@ �M��,��1��V��2��7��Cî7��.�M�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��153��H��8��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>,*��H��8��2��7��Cî7��/��T��>��1��?��T��J��Q�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1/-��W��?��A��;��O��V��2�,��7��C��7��G��A��.��T#!��W��?ҞM��O��V��2��7��Cî7��A��.�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1��2�,��>��B��-��4��5��J��2��>��B��-��5�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1hf��H��8�
-��N��2�,ԓ4��D��C��7��G��7��7��B��K��;��9��/��T��>��1��K��L��/��U��5�
-��5����>��2�,��W��FVT��H��8�
-��N��2��4��Cî7��7��B��K��;��9��/��T��>��1��K��L��/��U��5�
-��5����>��2��W�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��186��H��,��7��H��8��2�,��R��N��V��C��7��G��/��T��7��>��1��H��,��>��1�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1��2��C��2��C�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1��V��2�,��7��C��7��G¶;��V��2��7��Cî7¶;���B��R��A��D��K��B��R��A��DDB��R��A��D��K��C��5P��=Pބ2��R��A��D��K��C��5��8�:��-��9��E��T86��R��A��D��C��5��=܉2��R��A��D��C��5��8�:��-��9��E��T��B��R��A��D��K��B��R��A��D><��@��G��D��5��ՂP��R��A��5��H��D��KϲL��K��2��!��Q��H�9��T;9��@��G��D��5��ՂP��R��A��5��H��DϲL��K��2��!��Q��H�9��T���B��R��A��D��K��B��R��A��DJH��H���5ՂP��2��C��D��K��L��A��R��A��K��3��D��K��M��K��5��D��Kև9��>��TA?��H���5ՂP��2��R��K��L��A��R��A��K��3��D��M��K��5��D��Kև9��>��B��R��A��D��K��B��R��A��D,*�F�7��C��P��L߫W��A��=��R��A��D��K��S��7)'�F�7��C��P��L߫W��A��=��R��A��D��S��7���B��R��A��D��K��B��R��A��DPNՂP��L��E��;ߏG��K��C��R��A��D��K��C��B��A��M��K��C��K�?��K��C��<��O��Sߋ5��,MKՂP��L��E��;ߏG��K��C��R��A��D��C��B��A��M��K��C��K�?��K��C��<��O��Sߋ5��,��B��R��A��D��K��B��R��A��DGE��,��9��;��D��K��1��?؇9��U��8ȴS��>��C��P��D��7��L��R��A��B��A��D��K><��,��9��;��D�R؇9��U��8ȴS��>��C��P��D��7��L��R��A��B��A��D���B��R��A��D��K��B��R��A��DDB��R��A��D��K��C��5P��=Pބ2��R��A��D��K��C��5��8�:��-��9��E��T86��R��A��D��C��5��=܉2��R��A��D��C��5��8�:��-��9��E��T��B��R��A��D��K��B��R��A��D\Z��:��D��K��C��L��C��B��C��L��5��?��L��F��L��>��H��D��K��C��R��A��K�?��M��KߏG��K��C��BùFPN��:��D��C��L��C��B��C��5��?��L��L��>��H��D��C��R��A��K�?��M��KߏG��K��C��BùF���B��R��A��D��K��B��R��A��DJH��H���5ՂP��2��C��D��K��L��A��R��A��K��3��D��K��M��K��5��D��Kև9��>��TA?��H���5ՂP��2��R��K��L��A��R��A��K��3��D��M��K��5��D��Kև9��>��B��R��A��D��K��B��R��A��Dqo�
-��2��C��D��KՂP��L����A��R��A��K��3��D��K��M��KߏG��K����HӒC��,��N��D��K��5��=��T��Uߋ5��,��,��=��>��:��J_]�
-��2��R��KՂP��L����A��R��A��K��3��D��M��KߏG��K����HӒC��,��D��5��=��T��Uߋ5��,��=��>��:���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;�ѤI��;��;��K��F��A��BѤI��;��;��K��F��A��B���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;� ѤI��;��S��>��>ٟ@��6��;��@Ԛ<ѤI��;��S��>��>��5��;��@Ԛ<���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;�JH��3Ԛ<ѤI��;��>��6��;��6��S��F����;�.��T��T��D��<��D�<��D��C��)�.��FDB��1ѤI��;��>��6��;��6��S��F����;�.��T��D��<��D�<��D��C��)�.��F���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;�PNѤI��;��A�9ٟ@��6��-�9��A��4��>��T�M��SѤI��2��2��E��7��>��>��2��O��D��@��TDBѤI��;��A @��6��9��>��T�M��SѤI��2��2��E��7��>��>��2��O��D��@��T���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;�><ѤI��;��B��2ѤI��;��2ѤI��;��0ѤI��;��S��NѤI��;��N��O��F��T><ѤI��;��B��2ѤI��;��2ѤI��;��0ѤI��;��S��NѤI��;��N��O��F��T���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;�53ѤI��;��>��>��;��U�0��>��;��D��6��P��G��D��S�D��A53ѤI��;��>��>��;��U�0��>��;��D��6��P��G��D��S�D��A���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;�ѤI��;��M��@��KѤI��;��M��@��K���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;� ѤI��;��8ٟ@��>��6��C��A�7��BѤI��;��8ٟ@��>��6��C��+�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G,*��9��E��N��3��9��>ڹ3��T��2��1��M����1��T)'��9��E��N��3��9��>ڹ3��T��2��M����1��T&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G,*��:��B�7��>��B��3��1��C��T��C��CԃP��-��C#!��:��B�7��>��B��3��1��C��CƠ<��C�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G53�1��TН?��>��/��3��>��N��D��3��>��2��H��T��C��.��:)'�1��?��/��3��N��D��3��>��2��H��C��.��:&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G/-Ԋ/��B��N��P��9��2��K��1��W��>��2Ԋ/��Lؒ.��=#!Ԋ/��N��9��K��1��W��>��2��Lؒ.��=�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G,*��V��>��N��Dͯ?ڹ3�F��1ȇN��;�9��F��G�B)'��V��>��N��Dͯ?ڹ3�F��1ׇN�9��F��G�B&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GDB��N��W��>ڹ3��2��1��%��K��9��E��?��A��F��F��?��D�J��EʡH��9��?�/86��N��W��>ڹ3��2��%��K��9��E��?��A��F��F��D�J��E��9�/�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G20��;�@�7��:��T��C��U��D��Tڹ3��>��NщQ��A�7��B#!��;�7��:��C��U��Dڹ3��>��NщQ��+&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G)'��A��F��F��?��9��E��1ڹ3��2��1��K��W��(#!��A��F��F��9��E��1ڹ3��2��K��W��(�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G53��E��E��O��3��5��B��5��Iٟ@�7��A��:��5��Gς1��6��T&$��E��>��3��B��5��@�7��A��:��5��G��+&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G86��>��>��8��R��V��G��Bڹ3��;��2��F��5��>��H�K��7��<��653��>��>��8��R��V��G��Bڹ3��;��2��F��5��>��H��7��<��6�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GA?��5�B��J��H����:ɚ��K�7��3��G��H��A��V��T��J��D��8��D��A��P86��5�B��J��H����:ɚ��K�7��3��G��H��A��V��T��D��8��A&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GDB��D��N��A��D��V��9��3��>��R��9��B�>��:��D��:��T��C��S��-��@��@��@86��D��A��D��V��9��3��>��R��9��B�>��:��D��:��C��S��@��@�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G��J��B�7��>��J��3��/��:��J�7��>��J��3��/��:&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G20��A��F��?��9��C��1��N��Wڹ3��2��1��K��:��&�8��7)'��A��F��9��C��1��N��Wڹ3��2��K��:��&��8�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G,*��9��E��N��3��9��>ڹ3��T��2��1��M����1��T)'��9��E��N��3��9��>ڹ3��T��2��M����1��T&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GSQ��9��D��B��B��3ҾW��1��9��6��5��9��D��:��Q��T��C��2ʶU��>��3�.ٟ@��6ǽ=��G��@Ԛ<A?��D��B��B��3ҾW��1��9��6��5ՔD��Q��C��2ʶU��>��3ٟ@��6��G��@Ԛ<�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G53�1��TН?��>��/��3��>��N��D��3��>��2��H��T��C��.��:)'�1��?��/��3��N��D��3��>��2��H��C��.��:&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GDB��5��3��>��R��D��>��B�7��H��L��T��D��>��B�7��:��L��G��D��S�D��ADB��5��3��>��R��D��>��B�7��H��L��T��D��>��B�7��:��L��G��D��S�D��A�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G,*��V��>��N��Dͯ?ڹ3�F��1ȇN��;�9��F��G�B)'��V��>��N��Dͯ?ڹ3�F��1ׇN�9��F��G�B&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G,*��5�D��Bڹ3�G��>��<��9�7��>�?��L��S�:)'��5�D��Bڹ3�G��>��<��9�7��>��F��S�:�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G20��;�@�7��:��T��C��U��D��Tڹ3��>��NщQ��A�7��B#!��;�7��:��C��U��Dڹ3��>��NщQ��+&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��Gki��R��V��>��N��Dڹ3��2��į?��1��T�9��Fܫ7�M��N��6��K��9��D��,��K��6�9��.��1��R��3�R��F����B��O��B��Tec��/��>��N��Dڹ3��2��į?��1��T�9��Fܫ7�M��N��6��K��9��D��,��K��6�9��.��1��R��3�R��F����B��O��B�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G53��E��E��O��3��5��B��5��Iٟ@�7��A��:��5��Gς1��6��T&$��E��>��3��B��5��@�7��A��:��5��G��+&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GSQ��6��3��P��7��S��4��D��T�9��I��8��7��1�Dڹ3��2��:��T��C��RٍB��KЅJ��C��G��>��6DB��6��3��P��7��S��C��T��8��7��1�Dڹ3��2��:��C��RٍB��KЅJ��C��>��6�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GA?��5�B��J��H����:ɚ��K�7��3��G��H��A��V��T��J��D��8��D��A��P86��5�B��J��H����:ɚ��K�7��3��G��H��A��V��T��D��8��A&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G&$��>��>��8��R��V��G��Bڹ3��;��2��F��5&$��>��>��8��R��V��G��Bڹ3��;��2��F��5�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��886يR��2��A��8��>يR��2��A��8��>��8��J��-��I��N��=��J��T)'يR��2ŞيR��2Ş��8��J��-��I��=��J��T�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8_]��>��7��JЁH��?��ʡH��W��O��U��A��7��J��1��H����N����=���F���F��HيR��2��8��>��G��@��K\Z��>��7��JЁH��?��ʡH��W��O��U��A��7��J��1��H����N����=���F���F��HيR��2��8��G��@��K�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8zx��(����"���������
-��U��A��7��J��1��H�
-��N����=���F���F��HيR��2��8��>��D��A��7��J��U��A��7��<��J��:��B��;��#_]�
-��U��A��7��J��1��H�
-��N����=���F���F��HيR��2��8��D��A��7��J��U��A��7��<��:��B��;��#�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8><يR��J��TيR��8��T��يR��<��J����N��	��=يR��J��-����8��T;9يR��J��TيR��8��T��يR��<����N��	��=يR��J��-����8��T�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��886يR��2��A��8��>يR��2��A��8��>��8��J��-��W��N����8��T,*يR��2ŞيR��2Ş��8��J��-��W��N����8��T�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8VT�
-��U��A��7��J��1��H����=���F���F��HيR��2��8��>�1��0�7����A��@��H۰M�3��AMK�
-��U��A��7��J��1��H����=���F���F��HيR��2��8�1��0�7����@��H�3��A�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8;9يR��2��8��>��9��K��A��8��D��6��P��>��JщQ��N��.��6��@Ԛ<53يR��2��8��9��K��A��8��D��P��>��JщQ��N��.��6��@Ԛ<�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8b`����U��A��7��J��1��H����N����=б�F���F��HيR��2��8��>��D��A��7��J��U��A��7��<��J��:��B��;\Z����U��A��7��J��1��H����N����=б�F���F��HيR��2��8��D��A��7��J��U��A��7��<��:��B��;���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD��/��U��P��G��,��N��K��Q��M��/��U��P��G��,��K��Q��M���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD ��/�J��P��C��9�8��?��U��P��T��/�J��P��9�8��?��U��P��T���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕDA?��/��/��P��/��/��P��OP��=��-��/��/��P��C�?��K��P��/��/��Pĩ8><��/��/��P��/��/��P��O��=��-��/��/��P��C�?��K��P��/��/��Pĩ8���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD><��N��R��9��K��U��P��S��>��9��S�9Ԛ<��/��D��9��D��R��K��@Ԛ<53��N��R��9��K��U��P��S��>��9��S��1��/��9��R��K��@Ԛ<���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD20��5��9��P��/ַ;��/��P��/��P��A��/��P��?��P��F��7,*��5��9��P��/ַ;��/��P��/��P��/��P��?��P��F���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕDPN��U��P�?İU��H��P��.��F��-��S��5��1��S�S��A��P��K�8��5��G��6�����)��ʪJH��U��P�?İU��H��1��F��-��S��5��1��S�S��A��P��K�8��G��6�����)��ʪ���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD ��U��PʡH��9��8��C��C��H��/��T��U��P��9��C��C��H��/��T���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD��PʰD��/��Fַ;��PʰD��/��1�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��WGE��>ܷT��1��8��W��>��/��2��6��S��C��S��EܷT��1��8��W��>��-��/��7��B��6A?��>��U��8��W��>��/��2��6��S��C��S��E��U��8��W��>��-��/��7��B��6�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��W��ܷT��1��W��>��/��>��/��C��SܷT��1��W��>��/��U��SܷT��1��W��>��/ܷT��1��W��>��/��>�8��M����6��@��6��6��>ќ:��0��F��6�<��G��>��2��6��7��(����%����!����"�~��U��W��>��/��>��/��C��S��U��W��>��/��U��S��U��W��>��/��U��W��>��/��>�8��M����6��@��6��6��>ќ:��0��F��6�<��G��>��2��6��7���ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��WGE��>ܷT��1��8��W��>��/��2��6��S��C��S��EܷT��1��8��W��>��-��/��7��B��6A?��>��U��8��W��>��/��2��6��S��C��S��E��U��8��W��>��-��/��7��B��6�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��WA?��6��D��Q��6��6��NیV��O��H��2ܷT��1��8��W��/��Q��6��6��;��6��=;9��6��Q��6��6��NیV��O��H��2��U��8��W��/��Q��6��6��;��6��=�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��W��8ܷT��1��O��W��=��;��8��U��O��W��=��;�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��W ܷT��1��W��F��M��>Л6��;��@��K��U��W��F��>Л6��;��@�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��W20��A��8��9�Q��EܷT��1��G��4��W��E��>��F��W��A��B)'��A��8ƋQ��E��U��G��4��W��E��>��F��A��B���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��XMK��:��?��:��?��L��I��M��W��#������D��E��=��X��<��F��#����#��%Ӳ&��Ӳ&��;9��:��?��:��?��L��M��W��#������D��E��=��XѶ<��#��#�Χ���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X ��W��R��:��?�9��Iʉ5��X��@Ԛ<��W��R��:��?��Iʉ5��X��@Ԛ<���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��XVT��9��W��I��>��:��?щQ��V��4��6��V��6��#����6��#��%��6��#����6��$����6��#�8���8GE��9��W��I��>��:��?щQ��V��6��V��6��#��6��#��6��#��6����6��#�8���8���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X�9��Wʉ5��X��@��N�9��Wʉ5��X��@��N���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X><��W��R��:��?�9��I��G��>ܤK��V��#����%ѾC��H��T��L��6��L��T53��W��R��:��?��I��G��>ܤK��V��#����%��5��L��6��L��T���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X;9��<��W��1��/��>��:��?б�����9��WڶU��5���P����R��T53��<��W��1��/��>��:��?���9��WڶU��5���P����R��T���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X53��E��W��N��6��=��A��9��S��:��?�9��I��>��<��G��V��620��E��W��N��6��=��A��9��S��:��?��I��>��<��G��V��6���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?��Gʉ5����B��W��/��U��X��7�A��E��3��D��#����&&$��:��?��G����B��W��/��U��X�A��E�����D��,��?��R��F��D��,��?��R��F;9��,��?��R��F��Q��U��B��D��A��P��;��0��T��?��6��T��)����!&$��,��?��R��F��Q��U��A��;��T��6��T��)��D��,��?��R��F��D��,��?��R��F/-��?��,��F��R��>��,��6��2ɀ?��E�B��P��2��2��>)'��8��F��R��>��,��6��2ɀ?��E�B��P��2��2���D��,��?��R��F��D��,��?��R��FMK��D��=��D��3��Dٟ@��F��R��?��,��1��@��?��>��1�9��Kٟ@�9ٟ@�-��4��,��@Ԛ<><��D��D��Rٟ@��F��R��8��1��@��?��>��1�9��K��9�-��4��,��@Ԛ<��D��,��?��R��F��D��,��?��R��F,*��R��F��>��BϨH��,��@��?��,��6��D��P�D��A&$��R��F��>��B؋8��@��8��6��D��P�D��A���D��,��?��R��F��D��,��?��R��F><��D��,��?��R��F��?ϨH��.��?��R��J��V��9��S��6��>��EщQ��@Ԛ<53��D��,�.��F��?��H�.��J��V��9��S��6��>��EщQ��@Ԛ<��D��,��?��R��F��D��,��?��R��F,*��R��F��,��?��H��A��V��T��J��D��8��D��A��P ��R��8��?��H��A��V��T��D��8��A���D��,��?��R��F��D��,��?��R��FGE��D��3��D����R�I��F��,��2��?��.����@��PیV��D��H��A��V��D��A��P��T><��D��R����R�I��F��,��2��?��.����@��PیV��D��H��A��V��A��T��D��,��?��R��F��D��,��?��R��F20��A��9��=��R��F��,��L��?��.��,��K��,��6щQ��@Ԛ<,*��9��R��F��,��L��?��.��,��K��,��6щQ��@Ԛ<���D��,��?��R��F��D��,��?��R��FA?��D��R��F��,íB��?��2��D��9��7��I��6��.��2��9��DܤK��4��@��@��@86��D��R��F��,íB��?��2��D��9��7��I��6��.��2�D��4��@��@��D��,��?��R��F��D��,��?��R��F��,��?��R��F��U��P��U��T��,�.��F��U��P��U���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I&$��9��C��Xֈ?�N��X��Iַ;��/��N��7��2��9��Xֈ?�N��X��I��/��NĚ7��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I><��S��9��C��X��7ֈ?�N��X��Iַ;��W��N��I��,ڶ>��T��0��N��6�Q20��S��9��Xֈ?�N��X��I��W��N��,ڶ>��T��0��N��6�Q���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I,*��9��C��Xֈ?�N��X��Iַ;��M��/��TۓR��7��K ��9��Xֈ?�N��X��I��M��TۓR��7��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I)'��9��C��Xֈ?�N��X��Iַ;��B�<ނB��<��T#!��9��Xֈ?�N��X��I��B�<ނB��<��T���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I ��:��-��9��C��Xֈ?�N��X��Iַ;��:��-��9��Xֈ?�N��X��I���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I&$��9��C��Xֈ?�N��X��Iַ;��/��N��7��2��9��Xֈ?�N��X��I��/��NĚ7��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��Iec��/��S��B��X��D��/��S��:��X��D��<Υ6��1یV��0��/��S��B��X��U��B��O��B��E��B��V��B��,��B��-ނB��<��TMK��/��S��B��D��/��S�:��D��<Υ6��V��0��/��S��B��UüO��E��B��V��,��-ނB��<��T���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I,*��9��C��Xֈ?�N��X��Iַ;��M��/��TۓR��7��K ��9��Xֈ?�N��X��I��M��TۓR��7��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I#!��9��C��FۨV��T��/��9��7ʡHб��6��9��F��8��/��9��7ʡHб��6���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I ��9��C��Xֈ?�N��X��Iַ;�7��4��9��Xֈ?�N��X��I�7���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I&$��9��C��Xֈ?�N��X��Iַ;��/��N��7��2��9��Xֈ?�N��X��I��/��NĚ7��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I��9��C��4ֈ?��Iַ;��6��B��T��9��4ֈ?��I��B��T���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�OPN��D��D��7��O��C��-��S�O֊2��>��S��DɵO��6��8�9��H��A��V��T��J��D��8��D��A��PA?��D��D��7��C��-��S�O��>��S��DɵO��6��8�9��H��A��V��T��D��8��A���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�OA?��7��8��B��Bر/��D��2ѺKٟ@��6��T��C��M��U�<��F������!����"20��7��8��B��Bر/��D��2ѺK��5��T��C��M��U�<��F��	���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�O ��5��7��:��C��D�O֊2ѺK��@Ԛ<��5��7��:��C��D�OѺK��@Ԛ<���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�ODBкB��9��N��7��:��C�O֊2��>ٟ@��6߇;��1��G��3��F��7;Q��6��7;Q��T><кB��9��N��7��:��C�O��>��5߇;��1��G��3��F��7;Q��6��7;Q��T���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�ODB��7��O��B��6��2��>��R��@��2��A��5��7;QԚ<��7;Q��T��7��N�3��>��M><��7��O��B��6��2��>��R��@��2��A��5��7;QԚ<��7;Q��T��7��N��>���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�O)'��V��D��D��7��B��C��9��2��>��/��6��7��T)'��V��D��D��7��B��C��9��2��>��/��6��7��T���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�O&$��V��7�J��R��1��:��2��R��<��@��@��@#!��V��7�J��R��1��:��2��R��<��@��@���2��8��>��1��S��6��Mſ2��>��1��5��M;9��K��S��6��MԚ<��2��8��D��A��P��;��0��T��?��6��T��)����!#!��K��5��MԚ<ſ2��A��;��T��6��T��)��2��8��>��1��S��6��Mſ2��>��1��5��M;9��2��8��=��S��0��M��2��8��G��N��0��6��W��,��6��4����6��T20ſ2��=��S��0��Mſ2��G��N��0��6��W��,��6��4����6���2��8��>��1��S��6��Mſ2��>��1��5��M><��2��8ٟ@��6�Q��>��D��1��.��S��6��M��>��=����6��P����6��T20ſ2��5�Q��>��D��1��.��5��M��>��=����6��P����6��2��8��>��1��S��6��Mſ2��>��1��5��M/-��2��8��>��M��2��8��1��S��6��MۓR��9��T��,��K&$ƿ2��>��Mƿ2��1��5��MۓR��9��T��,��K���2��8��>��1��S��6��Mſ2��>��1��5��M><��2��8��1��S��6��M��>��1�H��3��PیV��D��H��A��V��D��A��P��T20ſ2��1��5��M��>��1�H��3��PیV��D��H��A��V��A��T��2��8��>��1��S��6��Mſ2��>��1��5��M53��A��2��8��=χ7��1��S��6��M��N��1��SщQχ7��=�R��J,*��Aſ2��=χ7��1��5��M��N��1��SщQχ7�R��J���2��8��>��1��S��6��Mſ2��>��1��5��M;9��K��S��6��MԚ<��2��8��D��A��P��;��0��T��?��6��T��)����!#!��K��5��MԚ<ſ2��A��;��T��6��T��)��2��8��>��1��S��6��Mſ2��>��1��5��M��S��6��M��E��S��2��8��@Ԛ<��5��M��E��Sſ2��@Ԛ<���2��8��>��1��S��6��Mſ2��>��1��5��M><��2��8ٟ@��6�Q��>��D��1��.��S��6��M��>��=����6��P����6��T20ſ2��5�Q��>��D��1��.��5��M��>��=����6��P����6��2��8��>��1��S��6��Mſ2��>��1��5��MYW��D��NԚ<��2��8��1��S��6��M��1�H��3��Vٟ@��2��8��1��D��A��P��;��0��T��?��6��T��)����!86��Nſ2��1��5��M��1�H��3��Vٟ@ſ2��1��A��;��T��6��T��)���2��8��>��1��S��6��Mſ2��>��1��5��M><��2��8��1��S��6��M��>��1�H��3��PیV��D��H��A��V��D��A��P��T20ſ2��1��5��M��>��1�H��3��PیV��D��H��A��V��A��T��2��8��>��1��S��6��Mſ2��>��1��5��M53��M��Vٟ@��2��8��D��A��P��;��0��T��?��6��T��)����!��Mٟ@ſ2��A��;��T��6��T��)���2��8��>��1��S��6��Mſ2��>��1��5��M;9��K��S��6��MԚ<��2��8��D��A��P��;��0��T��?��6��T��)����!#!��K��5��MԚ<ſ2��A��;��T��6��T��)��2��8��>��1��S��6��Mſ2��>��1��5��M#!��A��2��8��1��S��6��M��D��S�D��A��Aƿ2��1��5��M��D��S�D��A�����P��J��>��R��JЍP��J��>��R��J,*��M��:��������D��>��J��6߻W��D��S�D��A,*��M��:��������D��>��J��6߻W��D��S�D��A����P��J��>��R��JЍP��J��>��R��JA?��V��N��N��,̥6��:��D��9��S��J��6�O��Q��Nέ;��L�S��DʡH��9�;86��V��N��,̥6��:��D��9��S��J��6��Q��N٭;�S��DʡH��9�;�����P��J��>��R��JЍP��J��>��R��J/-����R߻WPۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ</-����R߻WPۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ<����P��J��>��R��JЍP��J��>��R��J53����DȂ3��@��>��QP��JP��J��>��R��C��R��A�7��B/-����DȂ3��@��>��QP��JP��J��>��R��C��R��+�����P��J��>��R��JЍP��J��>��R��Jqo��=��>����Q��H��,��<��5�Wį?��;��>P��J��BPۃJ��D��9��S�I�F��>��J��6��R��N��7��>�8��0��9��D��S��PԮK߀3hf��=��>����Q��H��,��5�Wį?��;��>P��J��BPۃJ��D��9��S�I�F��>��J��6��R�7��>�8��0��9��D��S��PٮK����P��J��>��R��JЍP��J��>��R��J;9��5��$��,��U��,������6��D��>��:��5��JЂJ��D��J��A�7��B/-��5��$��,��,Ѝ��6��D��>��:��5��JЂJ��D��J��+�����P��J��>��R��JЍP��J��>��R��JPN������P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6ǽ=��E��X��>��PGEЍ��P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6��E��>��P����P��J��>��R��JЍP��J��>��R��J��G��7��T��Q��-��G��7��T��Q��-�����P��J��>��R��JЍP��J��>��R��J)'����QP��J��R��6�9��:��A��D��S�D��A#!����QP��J��R��9��A��D��S�D��A����P��J��>��R��JЍP��J��>��R��JDB�����������A��2ûR�9��?��A��>��;��B��TûR�9��?��A��>��5��653��T�A��2ûR�9��?��A��>��;��BûR�9��?��A��>��5��6�����P��J��>��R��JЍP��J��>��R��J,*��M��:��������D��>��J��6߻W��D��S�D��A,*��M��:��������D��>��J��6߻W��D��S�D��A����P��J��>��R��JЍP��J��>��R��J><������6��J��D��9��S��J��6ȻW̑-�9ٟ@P��J��>��R��J��@Ԛ<86����6��J��D��9��S��J��6ȻW̑- @P��J��>��R��J��@Ԛ<�����P��J��>��R��JЍP��J��>��R��J/-����R߻WPۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ</-����R߻WPۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ<����P��J��>��R��JЍP��J��>��R��JMK����6��6�D��J��QP��L��>��J��R��J��J��QP��L��@��B��J��9��Uڤ5��5��@Ԛ<GE����6��6�D��J��QP��L��>��J��R��J��J��QP��L��@��B��J��Uܤ5��@Ԛ<�����P��J��>��R��JЍP��J��>��R��Jqo��=��>����Q��H��,��<��5�Wį?��;��>P��J��BPۃJ��D��9��S�I�F��>��J��6��R��N��7��>�8��0��9��D��S��PԮK߀3hf��=��>����Q��H��,��5�Wį?��;��>P��J��BPۃJ��D��9��S�I�F��>��J��6��R�7��>�8��0��9��D��S��PٮK����P��J��>��R��JЍP��J��>��R��J����4��B��4յG��W��G��X��F����4��B��4��W��X�����P��J��>��R��JЍP��J��>��R��JPN������P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6ǽ=��E��X��>��PGEЍ��P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6��E��>��P����P��J��>��R��JЍP��J��>��R��J,*������R��:��D��>ڝJ��R��K��2��D��G��@��K&$Ѝ��R��:��D��>ڝJ��R��K��2��D��@��K�����P��J��>��R��JЍP��J��>��R��J)'����QP��J��R��6�9��:��A��D��S�D��A#!����QP��J��R��9��A��D��S�D��A����P��J��>��R��JЍP��J��>��R��J;9��5��$��,��U��,������6��D��>��:��5��JЂJ��9��W��W����C/-��5��$��,��,Ѝ��6��D��>��:��5��JЂJ��9āR��A�����P��J��>��R��JЍP��J��>��R��J,*��M��:��������D��>��J��6߻W��D��S�D��A,*��M��:��������D��>��J��6߻W��D��S�D��A����P��J��>��R��JЍP��J��>��R��J�����A��F��8�,��TЍ�A��F��,��T�����P��J��>��R��JЍP��J��>��R��J/-����R߻WPۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ</-����R߻WPۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ<����P��J��>��R��JЍP��J��>��R��J	����4��B	����4��B�����P��J��>��R��JЍP��J��>��R��Jqo��=��>����Q��H��,��<��5�Wį?��;��>P��J��BPۃJ��D��9��S�I�F��>��J��6��R��N��7��>�8��0��9��D��S��PԮK߀3hf��=��>����Q��H��,��5�Wį?��;��>P��J��BPۃJ��D��9��S�I�F��>��J��6��R�7��>�8��0��9��D��S��PٮK����P��J��>��R��JЍP��J��>��R��J/-��U��C��U��TʡH��>��/��X��>����>�A��2���� ��U��UʡH��>��/��X����>��2Ѝ�����P��J��>��R��JЍP��J��>��R��JPN������P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6ǽ=��E��X��>��PGEЍ��P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6��E��>��P����P��J��>��R��JЍP��J��>��R��J�����A��F��P��C��<��B��BЍ�A��F��P��<��B��B�����P��J��>��R��JЍP��J��>��R��J)'����QP��J��R��6�9��:��A��D��S�D��A#!����QP��J��R��9��A��D��S�D��A����P��J��>��R��JЍP��J��>��R��J&$����U��R��:��D��>��J����B����/��T&$����U��R��:��D��>��J����B����/��T���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB���2��<��;��>��2��<��;��>���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�)'��V����2��P��K��C�4��EȯB��-�;��J��6&$��V����2��P��K��C�4��EȯB��-�;ϜJ���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�20�T��3��=��C��;��D��9��>��:��C��O��-֛7��<��B��B,*�T��=��C��;��D��9��>��:��C��O��-��<��B��B���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�/-ȯB��K��C��;��9��;��L�V��6�����)��ʪ��,*ȯB��K��C��;��;��L�V��6�����)��ʪ�����P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�53��2��<��F��2��D��C��D��:��LܾW��X��F��H��F��N��I��9)'��2��<��,��D��:��LܾW��X��F��H��F��N��I���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�#!��H����D��C��,ȯB��J��P��I��@Ԛ<#!��H����D��C��,ȯB��J��P��I��@Ԛ<���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�,*��>����2��P��:ȯB��K��6��N�K��D��S�D��A,*��>����2��P��:ȯB��K��6��N�K��D��S�D��A���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�#!��P��N��ȯB��>��9��H��-�B�V��6#!��P��N��ȯB��>��9��H��-�B�V��6���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�JH����D��P��RȯB��I��H��,��5��6��:��L��I��B��,��I��;�9��V��;��K��XܤK��$GE����D��P��RȯB��I��H��,��5��6��:��L��I��B��,��I��;��V��;��K��XܤK��$�����X˩5�R�9��:����X˩5ֲ9/-��'����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<,*����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<����X˩5�R�9��:����X˩5ֲ9,*ȏBҲU��>��R��<��G��I��X��I��C��E��#��CҮJ��B��>ɸ<��I��X��I��C��#߭J�����X˩5�R�9��:����X˩5ֲ9GEݩ5��T��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<DBݩ5��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<����X˩5�R�9��:����X˩5ֲ9GE����B߹-�;��:��XܷT��6˩5��J˩5��4����B߹-�;��:��XܷT��6˩5��/��7;9����B��-��:��XܷT��6��J˩5��4����B��-��:��XܷT��6��/��7�����X˩5�R�9��:����X˩5ֲ9��D��B��:����>˩5��A��K��B��:����>˩5��A��K����X˩5�R�9��:����X˩5ֲ9��6��T��'��߹-��X��6˩5��6��T��߹-��X��6�����X˩5�R�9��:����X˩5ֲ9/-��'����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<,*����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<����X˩5�R�9��:����X˩5ֲ9A?��Uٟ@����5߹-��:��X��D˩5��I��:��X��B��9��D˩5ƛK��6��@��@��@;9��Uٟ@����5߹-��:��X��D˩5��I��:��X��B��D˩5ƛK��6��@��@�����X˩5�R�9��:����X˩5ֲ9GEݩ5��T��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<DBݩ5��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<����X˩5�R�9��:����X˩5ֲ9GE����B��I�;��:��XܷT��6˩5��J˩5��4����B��I�;��:��XܷT��6˩5��/��7A?����B��I�;��:��XܷT��6��J˩5��4����B��I�;��:��XܷT��6��/��7�����X˩5�R�9��:����X˩5ֲ9��D��B��:����>˩5��A��K��B��:����>˩5��A��K����X˩5�R�9��:����X˩5ֲ9;9��5˱U̾-��C��3��C��I��Q��:����>����:��X��>��6˩5��,��;86��5˱U̾-��C��3��C��I��Q��:����>����:��X��>��6��,��;�����X˩5�R�9��:����X˩5ֲ9/-��'����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<,*����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<����X˩5�R�9��:����X˩5ֲ9\Z����B��H����6ӻB��O��߹-��:��XܷT��B��H��߹-��XܷT��;��W��;����N����=��	�F��J˩5��4YW����B��H����6��O��߹-��:��XܷT��B��H��߹-��XܷT��;��W��;����N����=��	�F��J˩5��4�����X˩5�R�9��:����X˩5ֲ9GEݩ5��T��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<DBݩ5��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<����X˩5�R�9��:����X˩5ֲ9)'����>��6˩5��5�W�R�9��:��D��S�D��A ����>��6��5�Wֲ9��D��S�D��A�����X˩5�R�9��:����X˩5ֲ9��D��B��:����>˩5��A��K��B��:����>˩5��A��K����X˩5�R�9��:����X˩5ֲ9A?б��=��	�F߹-��=��X����B˩5��H��F��S��T��:��TʡH��?��CگD��/86б��=��	�F߹-��=��X����B��H��F��S��T��T��9��CگD��/�����X˩5�R�9��:����X˩5ֲ9/-��'����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<,*����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<����X˩5�R�9��:����X˩5ֲ9nl�R��A߹-��:��X��>����B��6˩5��1��D��0�;��Hٟ@�R�9��:��K��B��B��>�5��I��B��E��K�R��F��T��D�>��6��@Ԛ<_]�R��A߹-��:��X��>����B��6��1��0��Hٟ@ֲ9��K��B��B��>�5��I��B��E��K�R��F��T��D�>��6��@Ԛ<�����X˩5�R�9��:����X˩5ֲ9GEݩ5��T��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<DBݩ5��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<����X˩5�R�9��:����X˩5ֲ9GE����K��B��6��N��E��I��:��X��5�R�9��:˩5��U�I�R��>��:��D��S�D��AA?����K��B��6��N��E��I��:��X��5ֲ9˩5��U�I�R��>��:��D��S�D��A�����X˩5�R�9��:����X˩5ֲ9��D��B��:����>˩5��A��K��B��:����>˩5��A��K����X˩5�R�9��:����X˩5ֲ9DB߹-��:��XܷT��6��H߹-��:��XܷT��6˩5��Q��'����Ѳ��B��6ӻB��O��453߹-��:��XܷT��6߹-��:��XܷT��6��Q����Ѳ��B��0��4�����X˩5�R�9��:����X˩5ֲ9/-��'����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<,*����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<����X˩5�R�9��:����X˩5ֲ9/-��0��:��X��6˩5��0��:��X��6˩5�>��4��6��4��T&$��0��:��X��6��0��:��X��6�>��4��6��T�����X˩5�R�9��:����X˩5ֲ9GEݩ5��T��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<DBݩ5��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<����X˩5�R�9��:����X˩5ֲ9A?б��=��	�F߹-��=��X����B˩5��H��F��S��T��:��TʡH��?��CگD��/86б��=��	�F߹-��=��X����B��H��F��S��T��T��9��CگD��/�����X˩5�R�9��:����X˩5ֲ9��D��B��:����>˩5��A��K��B��:����>˩5��A��K����X˩5�R�9��:����X˩5ֲ9\Z��D�R��A��9į?߹-��=��X��>��6˩5��H��0��-��D��E��0��6��EщQ��I��.��6щQ��2��2��D��S�D��AYW��D�R��A��9į?߹-��=��X��>��6��H��0��-��D��E��0��6��EщQ��I��.��6щQ��2��2��D��S�D��A�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K#!����N��;��W��H��E��K��9ݠ.��E��T����N��;��W��H��E��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��;��W��H��E��K��K��9ݠ.��E��T����N��;��W��H��E��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K)'����N��9ݠ.��;��W��K��E��	��9ݠ.��E��T ����N��9��;��W��K��	��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K#!����N��;��W��K��E��K��9ݠ.��E��T����N��;��W��K��K��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K#!����N��F��;��W��E��K��9ݠ.��E��T ����N��F��;��W��E��K��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K)'����N��F��;��W��2��T��9��K��9ݠ.��E��T&$����N��F��;��W��2��T��9��K��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K#!����N��F��;��W��H��K��9ݠ.��E��T ����N��F��;��W��H��K��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K)'����N��9ݠ.��;��W��H��E��	��9ݠ.��E��T#!����N��9��;��W��H��E��	��9��E��T���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I/-��:��T��6��7��8��:��T��6��7��Iַ;P��=��8��-)'��:��T��6��7��8��:��T��6��7��I��=��8��-���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I ��T��1��8��:��C��T��6��7��Iַ;��T��1��8��:��T��6��7��I���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��Iܥ6��0��T��6��7ȣ8��Iַ;ܥ6��0��T��6��7��I���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I,*��:��0��E��U��P��U��,��I��:��T��6��7��Iַ;#!��:��0��8��P��,��I��:��T��6��7��I���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I��0��T��6��7ȣ8��Iַ;��0��T��6��7��I���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I��:��C��T��6��7��Iַ;��@��?��:��T��6��7��I��@���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I/-��:��C��T��6��7��Iַ;��:��C��T��6��7��Iַ;��;#!��:��T��6��7��I��:��T��6��7��I��;���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I,*��0��E��U��4��J��8��:��C��T��6��7��Iַ;ܥ6 ��0��8��4��J��:��T��6��7��Iܥ6���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I)'��:��C��T��6��7��Iַ;��M��/��TۓR��7��K��:��T��6��7��I��M��TۓR��7���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I)'��:��C��T��6��7��Iַ;��M��/��TۓR��7��K��:��T��6��7��I��M��TۓR��7���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I����P����P���4��X��>��E��1��;��4��X��>��B ��E��1��;��>��X��H��MʭB��W��T��B��>��X��H��MʭB��W��4��X��>��E��1��;��4��X��>��B�:��D��>�7��5��.��T�:��D��>�7��5��.��T���4��X��>��E��1��;��4��X��>��B/-��-��X��E��1��;��7߹-��W��D��7Օ��Nծ��H����-��X��B��Օ��Nծ��H����4��X��>��E��1��;��4��X��>��BMK��%��X��6��Xޡ8��X��S��X��8��X��N��X��.��X��C��X�C��X��F��X��2��X��4��X��CA?��%��X��6��Xޡ8��X��X��8��X��N��X��X��X��X��F��X��2��X��4��X��C���4��X��>��E��1��;��4��X��>��B ��E��1��;��/��6��4��D��G��@��K��B��/��6��D��@��K��4��X��>��E��1��;��4��X��>��BSQİF��E��1��;��/��6��4��X۹/��>��O��X۹/��>��TʭB��S��>��OʭB��S��>��T��U��>��6��K53İF��B��/��6��X��>��O��X��>��B��>��O��B��>��U��>��6���4��X��>��E��1��;��4��X��>��Bki��E��1��;��M�I��B��>ю2��/��4��A��T��2��3��W��S��;��X��Iю2��Xю2��>ю2��U��A��T��X��I��X�����)����PN��B��M��I��>ю2��/��4��F��2��3��W��S��;��X��2�2ю2��U��F��X��I�����)������4��X��>��E��1��;��4��X��>��B��/��4��?����B��O��B��T��/��4��?����B��O��B���4��X��>��E��1��;��4��X��>��B ��E��1��;��>��X��H��MʭB��W��T��B��>��X��H��MʭB��W��4��X��>��E��1��;��4��X��>��B)'��/��4��3��>��L��?��?��H��F��?����F��T#!��/��4��3��>��L��?��H��F����F��T���4��X��>��E��1��;��4��X��>��B/-��-��X��E��1��;��7߹-��W��D��7Օ��Nծ��H����-��X��B��Օ��Nծ��H����4��X��>��E��1��;��4��X��>��BGE˛5��9��/��=��T��4��>��X��?ޡ8��R��V��4��>��E��1��;��6��T��4��4��K��2,*���-��4��>����/��4��>��B��6��T��5��K��2���4��X��>��E��1��;��4��X��>��B ��E��1��;��/��6��4��D��G��@��K��B��/��6��D��@��K��4��X��>��E��1��;��4��X��>��BVT��D�G��:��/��4��X��>��3��?��X��?��F��B��T��F��?ޡ8��H��?��.��:��FʭB��.��4��?��F��6><��G��:��/��4��X��>��3��?��X��F��B��T��Fޡ8��H��.��F����F��6���4��X��>��E��1��;��4��X��>��Bki��E��1��;��M�I��B��>ю2��/��4��A��T��2��3��W��S��;��X��Iю2��Xю2��>ю2��U��A��T��X��I��X�����)����PN��B��M��I��>ю2��/��4��F��2��3��W��S��;��X��2�2ю2��U��F��X��I�����)������4��X��>��E��1��;��4��X��>��BVT��E��1��;��>��C��6��P��K��H��,��-��X��?��7�1�E��7��0����NʡH����H��0��6��4��T��DB��B��>��C��6��P��K��H��,��-��X����0����NʡH����H��0��6��4��T�����4��X��>��E��1��;��4��X��>��B ��E��1��;��>��X��H��MʭB��W��T��B��>��X��H��MʭB��W��4��X��>��E��1��;��4��X��>��B#!��U�/��4��X��>��3��B��?��8�,��T ��U�/��4��X��>��3��B��?��,��T���4��X��>��E��1��;��4��X��>��B/-��-��X��E��1��;��7߹-��W��D��7Օ��Nծ��H����-��X��B��Օ��Nծ��H����4��X��>��E��1��;��4��X��>��B/-��-��4��4��6��M��;����-��>��>��@��W��>��W��>)'��-��5��6��M��;����-��>��>��@��>��W��>���4��X��>��E��1��;��4��X��>��B ��E��1��;��/��6��4��D��G��@��K��B��/��6��D��@��K��4��X��>��E��1��;��4��X��>��B53��?��4��1��K��>��F����7��>�>��D��<��(��6հL��3��T53��?��4��1��K��>��F����7��>�>��D��<��(��6հL��3��T���4��X��>��E��1��;��4��X��>��Bki��E��1��;��M�I��B��>ю2��/��4��A��T��2��3��W��S��;��X��Iю2��Xю2��>ю2��U��A��T��X��I��X�����)����PN��B��M��I��>ю2��/��4��F��2��3��W��S��;��X��2�2ю2��U��F��X��I�����)������4��X��>��E��1��;��4��X��>��B_]��E��1��;��1��-��X��?��P��@��4��B��S��?��H��-��M��>ԁ:�F��T��,��;��J��8��L�0��(��������!MK��B��1��-��X��?��P��@��4��B��S��?��H��-��M��>ԁ:�F��T��,��;��J��8��L�0��X���4��X��>��E��1��;��4��X��>��B ��E��1��;��>��X��H��MʭB��W��T��B��>��X��H��MʭB��W��4��X��>��E��1��;��4��X��>��B)'��/��4��3��?��>��L��H��J��X�/��E��N��B#!��/��4��3��?��>��L��H��J��X��E��N���4��X��>��E��1��;��4��X��>��B/-��-��X��E��1��;��7߹-��W��D��7Օ��Nծ��H����-��X��B��Օ��Nծ��H����4��X��>��E��1��;��4��X��>��B����3��H��2��4��C��M΄/ǟ9��=��Tޡ8��?������$����������������������ڻ��������������(����$�������������������!����3��H��2��4��C��Mτ/��-�8������$����������������������ڻ��������������(����$�������������������!���4��X��>��E��1��;��4��X��>��B ��E��1��;��/��6��4��D��G��@��K��B��/��6��D��@��K��4��X��>��E��1��;��4��X��>��B,*��/��4��3��?��>��L��H��J��X�/��E�1ʞ:��-)'��/��4��3��?��>��L��H��J��X��E�1ʞ:��-���4��X��>��E��1��;��4��X��>��Bki��E��1��;��M�I��B��>ю2��/��4��A��T��2��3��W��S��;��X��Iю2��Xю2��>ю2��U��A��T��X��I��X�����)����PN��B��M��I��>ю2��/��4��F��2��3��W��S��;��X��2�2ю2��U��F��X��I�����)������4��X��>��E��1��;��4��X��>��B,*��H��E��1��;��>��/��4��H��?��L��B��<��B��B#!��H��B��>��/��4��H��L��B��<��B��B���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F)'��4�>��B��W��A��Q��T��2��>��J��F��@Ԛ< ��4�>��B��W��A��L��>��S��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F86��9��G��O��B��Q��T��2��>����P��V��P��.��5��A��J��>��P)'��9��G��O��B��L��>����P��P�.��J��>��P���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F20��R��0��W��6�>��BйS��Q��T��2��>��F��D��S�D��A,*��R��0��W��6�>��BйS��L��>��F��D��S�D��A��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��FA?�C��O��W��>��M�>��B��W��A��Q��T��2��9��6��O��8��G��D��S�D��A;9�C��O��W��>��M�>��B��W��A��L��9��6��O��8��G��D��S�D��A���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��FMK��R��9��G��M��W��W��A��I�>��B��N��=��=�9��=��A��>��M��N��S��9��=��A�7��B86��RךG��W��W��A��I�>��B��N��=��=��>��M��N��S��9��=��+��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F20��9��G�>��B��Q��T��2��>��V��J��7��6��8��T��7��=&$��9��G�>��B��L��>��VќJ��6��8��7��=���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F;9��D��9��D��I�>��B��W��R��Q��T��2��>��S��Q�U��>��V��@Ԛ<&$��9��I�>��B��W��R��L��S��>��V��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F/-��
-��F��W��L����S��J������$����2����A��B/-��
-��F��W��L����S��J������$����2����A��B���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F86��D��S��8��G�>��B��W��A��Q��T��2��>��M��@��?��@��@��@/-��D��S��8��G�>��B��W��A��L��>��M��@��?��@��@��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F)'��N��F��H��F��O��F��O��V��V��A��4��@��K&$��N��F��H��F��O��F��O��V��V��A��4��@���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F&$��W��>��V�>��B��W��A��Q��T��2��@Ԛ< ��W��>��V�>��B��W��A��L��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F86�>��B��Q��T��2��>΂P��F��;�/��U����N��5��L��U��ٶ,*�>��B��L��>΂P��F��;�/��U����N��5��L��U���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F)'��4�>��B��W��A��Q��T��2��>��J��F��@Ԛ< ��4�>��B��W��A��L��>��S��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F#!�>��B��W��B��Q��T��2��>��F��@Ԛ<�>��B��W��B��L��>��F��@Ԛ<���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F20��R��0��W��6�>��BйS��Q��T��2��>��F��D��S�D��A,*��R��0��W��6�>��BйS��L��>��F��D��S�D��A��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F#!��W��2��E�>��D��Q��T��2ϩN��F��B��W��E�>��D��LϩN��F���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��FMK��R��9��G��M��W��W��A��I�>��B��N��=��=�9��=��A��>��M��N��S��9��=��A�7��B86��RךG��W��W��A��I�>��B��N��=��=��>��M��N��S��9��=��+��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F��8��F��5��R��.��U��E��S��2��8��F��=��.��U��E��S���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F;9��D��9��D��I�>��B��W��R��Q��T��2��>��S��Q�U��>��V��@Ԛ<&$��9��I�>��B��W��R��L��S��>��V��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F)'��J��>��R��8��"����������F��K��%��F��J��>��R������F��%��F���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F86��D��S��8��G�>��B��W��A��Q��T��2��>��M��@��?��@��@��@/-��D��S��8��G�>��B��W��A��L��>��M��@��?��@��@��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F53��D��9��6��M��E��K�>��B��Q��T��2��>��V��D��@��@��@&$��D��9��6��E��K�>��B��L��>��V��@��@���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F&$��W��>��V�>��B��W��A��Q��T��2��@Ԛ< ��W��>��V�>��B��W��A��L��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F ֖F��>��P��Mމ6��J��6��J����7��+��Mމ6��J��6��J����7���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F)'��4�>��B��W��A��Q��T��2��>��J��F��@Ԛ< ��4�>��B��W��A��L��>��S��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F/-��U�Mӛ?�1��?��7��F��,��7���M�����R��Q#!��U�Mӛ?�1��?���M�����R��Q���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F20��R��0��W��6�>��BйS��Q��T��2��>��F��D��S�D��A,*��R��0��W��6�>��BйS��L��>��F��D��S�D��A��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F/-��D��9��6��M�>��B��W��A��Q��T��2��D��S�D��A&$��D��9��6�>��B��W��A��L��D��S�D��A���I��D��T��0��I��I��D��0��IDB��D��T��3��0��I��D��T��3��0��I��4��D��T��3��0��IǱ.��>��4��I����?86��D��3��0��I��D��3��0��I��4��D��3��0��IǱ.��>��4��I��2��I��D��T��0��I��I��D��0��I,*��I��D��T��0��1�,��I��C��D��T��0��I��D��T#!��I��D��T��0��1��I��D��0��I��D��T���I��D��T��0��I��I��D��0��I&$��R��I��D��T��N��0��I��0��I��4��@Ԛ<��R��I��D��0��0��I��4��@Ԛ<��I��D��T��0��I��I��D��0��I/-��0��I��9��Q��6��S��=��K��I��8��K��I��:��@Ԛ</-��0��I��9��Q��6��S��=��K��I��8��K��I��:��@Ԛ<���I��D��T��0��I��I��D��0��I��0��I��5��I����?��0��I��5��I��2��I��D��T��0��I��I��D��0��IA?��D��T��3��I��5��8��D��T��3��8��I��5��D��T��3��X��5��8��I����?53��D��3��I��5��8��D��3��8��I��5��D��3��X��5��8��I��2���I��D��T��0��I��I��D��0��I��0��I��Iַ;��4��D��G��@��K��0��I��I��4��D��@��K��I��D��T��0��I��I��D��0��I&$��I��D��T��P��D��N��0��I��0��I��@Ԛ<��I��D��P��D��0��0��I��@Ԛ<���I��D��T��0��I��I��D��0��I#!��I��0��IػK��I��0��I��4��I��0��I#!��I��0��IػK��I��0��I��4��I��0��I��I��D��T��0��I��I��D��0��I/-��0��I��D��T��Fַ;��8��-����8��T��������!��0��D��1��8��-����8�����I��D��T��0��I��I��D��0��I��0��I��4��@Ԛ<��0��I��4��@Ԛ<��I��D��T��0��I��I��D��0��I20��D��0��I��D��0��I��4��D��0��IǱ.��>��4��I����?/-��D��0��I��D��0��I��4��D��0��IǱ.��>��4��I��2���I��D��T��0��I��I��D��0��I��0��IǱ.��>��4��@Ԛ<��0��IǱ.��>��4��@Ԛ<��I��D��T��0��I��I��D��0��I20��I��D��T��,�;��0��1�,��I��C��D��T��0��I��D��T)'��I��D��T��,�;��0��1��I��D��0��I��D��T���I��D��T��0��I��I��D��0��IDB��D��T��3��0��I��D��T��3��0��I��4��D��T��3��0��IǱ.��>��4��I����?86��D��3��0��I��D��3��0��I��4��D��3��0��IǱ.��>��4��I��2��I��D��T��0��I��I��D��0��IDB��,��TܷT��0��I��,��TܷT��0��I��4��,��TܷT��0��IǱ.��>��4��I����?86��,ܷT��0��I��,ܷT��0��I��4��,ܷT��0��IǱ.��>��4��I��2���I��D��T��0��I��I��D��0��I&$��R��I��D��T��N��0��I��0��I��4��@Ԛ<��R��I��D��0��0��I��4��@Ԛ<��I��D��T��0��I��I��D��0��IMK��D��T��3��N��0��I��D��T��3��N��0��I��4��D��T��3��N��0��IǱ.��>��4��I����?/-��D��3��0��D��3��0��4��D��3��0Ǳ.��>��4��I��2���I��D��T��0��I��I��D��0��I��0��I��5��I����?��0��I��5��I��2��I��D��T��0��I��I��D��0��I��0��IػK��4��@��K��0��IػK��4��@��K���I��D��T��0��I��I��D��0��I��0��I��Iַ;��4��D��G��@��K��0��I��I��4��D��@��K��I��D��T��0��I��I��D��0��I��0��I��D��G��@��K��0��I��D��@��K���I��D��T��0��I��I��D��0��I#!��I��0��IػK��I��0��I��4��I��0��I#!��I��0��IػK��I��0��I��4��I��0��I��I��D��T��0��I��I��D��0��I53��D��T��3��I��D��T��3��Iַ;��D��T��3�O�I��I����?#!��D��3��I��D��3��I��D��3�O��I��2��L��7ٟ@��8��Lٟ@��8��L��7��@��?�L��@�L��7ٟ@��8��Lٟ@��8��L��7��B��6	�L��B��6��L��7ٟ@��8��Lٟ@��8�,*��6P��,�L��7ٟ@��8��H��7��@�K��7��@Ԛ<#!��6�ٟ@��8��H��7��@�K��7��@Ԛ<�L��7ٟ@��8��Lٟ@��8��L��7��@��K	�L��@��K��L��7ٟ@��8��Lٟ@��8��L��7��@��?�L��@�L��7ٟ@��8��Lٟ@��8��L��7��6��?	�L��6��?��L��7ٟ@��8��Lٟ@��8�,*��6P��,�L��7ٟ@��8��H��7��@�K��7��@Ԛ<#!��6�ٟ@��8��H��7��@�K��7��@Ԛ<�L��7ٟ@��8��Lٟ@��8�,*��6�L��7��8��>ٟ@��H��F��@��F��7��6��>��P)'��6�L��8��>ٟ@��H��F��@��F��7��6��>��P��L��7ٟ@��8��Lٟ@��8��L��7��@��?�L��@�L��7ٟ@��8��Lٟ@��8�20��A��7�L��7�Hٟ@��8��E��P��;��:��P��O��@��@��@,*��A��7�L�Hٟ@��8��E��P��;��:��P��O��@��@��L��7ٟ@��8��Lٟ@��8�,*��6P��,�L��7ٟ@��8��H��7��@�K��7��@Ԛ<#!��6�ٟ@��8��H��7��@�K��7��@Ԛ<�L��7ٟ@��8��Lٟ@��8�P��,�L��7��?��6��0���?��6��0��L��7ٟ@��8��Lٟ@��8��L��7��@��?�L��@�L��7ٟ@��8��Lٟ@��8�/-�L��7ٟ@��8��A��R��>��:��6��>��N��D��S�D��A,*�Lٟ@��8��A��R��>��:��6��>��N��D��S�D��A��L��7ٟ@��8��Lٟ@��8�,*��6P��,�L��7ٟ@��8��H��7��@�K��7��@Ԛ<#!��6�ٟ@��8��H��7��@�K��7��@Ԛ<�L��7ٟ@��8��Lٟ@��8�wu�L��7��D��F��6�L��7��B��7�L��7��6��<��6P��,�L��7��
-�Gٟ@��8��6��7��@��7��5�L��7��8��>ٟ@�;��F��J��>��N��1�S_]�L��D��F��6�L��B��7�L��6Ǥ<���
-�Gٟ@��8��6��7��@��7��5�L��8��>ٟ@�;��F��J��>��N��1�S�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0��A��=��Tɾ=��S��N��.��W��0�A��T����(����"�0��=��T̗<��.��0�A��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��S,*¨0��A��=��Tɾ=��C��P��I��/��C��/��9��?��T#!�0��=����P��I��/��C��/��9��?��T�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S¨0ʽ=��>��=��Tɾ=��R��@Ԛ<�0��>��=��Tɾ=��R��@Ԛ<¨0��A��=��Tɾ=��S�0��=��Tɾ=��S86¨0ʽ=��=��Tɾ=��C��6��=��Tɾ=��C��6��A�A��N��T��A��T#!�0��=��T�6��=��T�6��A��A��T��A�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0ʽ=��Dٟ@ޢ<�Qɾ=��@��=��Tɾ=��C��P��H��/��4&$�0��Dٟ@ޢ<�Qɾ=��@��=����H��/��4¨0��A��=��Tɾ=��S�0��=��Tɾ=��S,*¨0��A��=��Tɾ=��W����7��>��7��C��<��B��B)'�0��=��Tɾ=��W����7��>��7��C��<��B��B�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S/-¨0��A��D��>��=��Tɾ=��CѲ/��D��T����(����"�0��D��>��=��Ѳ/��D��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��S/-��=��Tɾ=��>¨0ʽ=ʇX��Qޢ<�Qɾ=��Cݰ?��Q��.&$��=��Tɾ=��>�0ʇX��Qޢ<�Q̾=ݰ?��Q�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S53¨0��A��=��Tɾ=��C��D�A��4��A�A��T��UʡH��9�A��/,*�0��=��T˾=��D��A��A�A��T��UʡH��9�A��/¨0��A��=��Tɾ=��S�0��=��Tɾ=��S53¨0��A��=��Tɾ=��J��6��O��T¨0��A��=��Tɾ=��J��6��K�0ҳ��O��T�0ҳ��K�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0��A��=��Tɾ=��S��N��.��W��0�A��T����(����"�0��=��T̗<��.��0�A��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��SJH¨0ʽ=��PʇX��D�Q��=��Tɾ=��C��>ΉX˛5¨0��A��/��T��D¨0��A��/�A��4�J53�-ʇX��D�Q��=����>ΉX˛5�0��/��T��D�0��/��A�J�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S¨0ʽ=��>��=��Tɾ=��R��@Ԛ<�0��>��=��Tɾ=��R��@Ԛ<¨0��A��=��Tɾ=��S�0��=��Tɾ=��S)'¨0��A��=��Tɾ=��W�9��L��/͒�A��4��T�0��=��Tɾ=��W��/͒�A��4�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0ʽ=��Dٟ@ޢ<�Qɾ=��@��=��Tɾ=��C��P��H��/��4&$�0��Dٟ@ޢ<�Qɾ=��@��=����H��/��4¨0��A��=��Tɾ=��S�0��=��Tɾ=��SSQ¨0ʽ=��P��N��=��Tɾ=��C��@��Eޢ<�Qɾ=��C��D��/��D��/��QİU��4��4����������/-�-��N��=����@��Eޢ<�Q̾=��D��/��D��9��4�*�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S/-¨0��A��D��>��=��Tɾ=��CѲ/��D��T����(����"�0��D��>��=��Ѳ/��D��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��S,*¨0��A��=��Tɾ=��6��=��T��3��O��T��D�A��4&$�0��=��Tɾ=��6��=��T��3��O��T��D��A�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S53¨0��A��=��Tɾ=��C��D�A��4��A�A��T��UʡH��9�A��/,*�0��=��T˾=��D��A��A�A��T��UʡH��9�A��/¨0��A��=��Tɾ=��S�0��=��Tɾ=��S ¨0��A��=��Tɾ=��6����8�,��T�0��=��Tɾ=��6����,��T�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0��A��=��Tɾ=��S��N��.��W��0�A��T����(����"�0��=��T̗<��.��0�A��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��SDB��S��4��8¨0ʽ=��P��=��Tɾ=��Cϛ)ϛ)�)�)�Q��Tɾ=��C��9��8��K��T/-ФO��8�-��=��ϛ)ϛ)�)�)�Q����9��8��K��T�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S¨0ʽ=��>��=��Tɾ=��R��@Ԛ<�0��>��=��Tɾ=��R��@Ԛ<¨0��A��=��Tɾ=��S�0��=��Tɾ=��S#!��E��=¨0ʽ=��=��Tɾ=��.��8��?̛<��=�0��=��Tɾ=��.��?�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0ʽ=��Dٟ@ޢ<�Qɾ=��@��=��Tɾ=��C��P��H��/��4&$�0��Dٟ@ޢ<�Qɾ=��@��=����H��/��4¨0��A��=��Tɾ=��S�0��=��Tɾ=��S;9¨0��A��=��Tɾ=��C��E��S��S��.��PщQ¨0��A�A��4��D�A��T&$�0��=����E��S��*щQ�0��A��D�A��T�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S/-¨0��A��D��>��=��Tɾ=��CѲ/��D��T����(����"�0��D��>��=��Ѳ/��D��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��S ¨0��A��D��>��=��Tɾ=��C��@��K�0��D��>��=����@��K�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S53¨0��A��=��Tɾ=��C��D�A��4��A�A��T��UʡH��9�A��/,*�0��=��T˾=��D��A��A�A��T��UʡH��9�A��/¨0��A��=��Tɾ=��S�0��=��Tɾ=��S��=��Tɾ=��C��6¨0��A��T��=��T�6�0��T�	��S�1��/��W��/߹-��CʡH��9��7��Qן9ں-ʡH��9��7��Qן9	��S�1��/��W��/)'��/��/Æ.��J��:��N��L��J��S�1��/��G��B ��/��/Æ.��J��:��N��L��W��/��G�	��S�1��/��W��/��S�1��/��B��;��A�A��T��W��/��B��A�A��T	��S�1��/��W��/��S�1�D��?	��W�D��?�	��S�1��/��W��/߹-��CʡH��9��7��Qן9ں-ʡH��9��7��Qן9	��S�1��/��W��/53��S�1��/��E��7��0��C��/��7��7����S�1��/����A��B#!��W��/��E����7����W��/����A��Bw	��S�1��/��W��/��S�1��/��B��;��A�A��T��W��/��B��A�A��T	��S�1��/��W��/�A��B�A��B�	��S�1��/��W��/߹-��CʡH��9��7��Qן9ں-ʡH��9��7��Qן9	��S�1��/��W��/20��9��J��/��?ſQ��5ߕJ��C��M��C��R��U��RН?��Q��T)'��9��J��/��?ſQ��5ߕJ��C��M��C��R����Q�	��S�1��/��W��/��S�1��/��B��;��A�A��T��W��/��B��A�A��T	��S�1��/��W��/><��B��U��>��9��@��V��W��F�?��Wַ;��;�E��-��S�1Н?��>��A��T20��B��>��9��@��V��W��F�?��Wַ;��;�E��-��W��?��A�	��S�1��/��W��/߹-��CʡH��9��7��Qן9ں-ʡH��9��7��Qן9	��S�1��/��W��/��S�1��/��I�A��6��W��/��I�A��6���E��G��?��>��-��E��G��?��>��-/-��G��?�R�1��4��2��T��N��5��=�7��@��P��:��J#!��G��?�1��4��2��T��5��=��@��P��J��E��G��?��>��-��E��G��?��>��-)'��G��?��>��-��P��L΅/��Bڶ>��S��J��@Ԛ< ��G��?��>��-΅/��B��S��J��@Ԛ<���E��G��?��>��-��E��G��?��>��-��G��?��T�4��G��?��T�4��E��G��?��>��-��E��G��?��>��-��G��W��-��T��G��*���E��G��?��>��-��E��G��?��>��-��E�,��G��?�/��-��"��D��:��E��G��?�/��-��"��D��E��G��?��>��-��E��G��?��>��-20��G��?��>��-��G��6��4��?��9ʉ5��;˫N¶;�P��N��T,*��G��?��>��-��G��4��?��9ʉ5��;ΫN�P��N��T���E��G��?��>��-��E��G��?��>��-86��G��?��>��-��2��2΅/��8��B��?¶7ģC��CщQ��D��P�D��A,*��G��?��>��-΅/��8��B��N�CщQ��D��P�D��A��E��G��?��>��-��E��G��?��>��-20����N��E�,��G��?��>��-��?¶7ʡH��W��B��:ģC��O#!����N��E��G��?��>��-��N��W��:�C���E��G��?��>��-��E��G��?��>��-&$��G��?��>��-��8��G��?��>��-��4��-��2#!��G��?��>��-��G��?��>��-��4��-��2��E��G��?��>��-��E��G��?��>��- ��G��?��>��-�/��.��BʭBѡ8¶;��G��?��>��-�/��BʭBѡ8¶;���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>,*��K�=�9��:ׄ9��?��DϪJ��P��>؞C��@��@��@ ��=��:ׄ9��?��DϪJ��Pρ>��@��@���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>��>؞C��1��9��Tׄ9��?��@Ԛ<ρ>��1��9��Tׄ9��?��@Ԛ<���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>Ư8��Hׄ9��?��>؞C��@��@��@Ư8��Hׄ9��?ρ>��@��@���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>ׄ9��?��=��7ׄ9��?��=��7���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>&$��>؞Cׄ9��?��6��R�1��T��D��P�D��A ρ>ׄ9��?��6�1��T��D��P�D��A���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>DB����=����>��1��9��Tׄ9��?ׄ9��B��9��>��>Ư8��I��>؞CбM�8��6><����=����>��1��9��Tׄ9��?ׄ9��B��9��>��>Ư8��Iρ>бM��8���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>#!��UP۴2��>��M��N��,��B��MСG��T��U��P��>��M��N��B��MСG��T���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>hf��1��	��T��Sׄ9��?��A��J�9��J��O��T��,��Q����S��F��>��T��9��P��,��1�R��>؞Cб��:��6�����)��ʪ_]��1��	��T��Sׄ9��?��A˱9��O��T��,��Q����S��F��>��T��9��P��,��1�Rρ>б��:��6�����)��ʪ���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>A?��:��9�1��S��Tׄ9��?��9��M��,��.��T��>��BϪJ�9��>؞C��@��@��@;9��:��9�1��S��Tׄ9��?��9��M��,��.��T��>��BϪJ�9ρ>��@��@�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2����6�9��4��2��A�7��B����6�9��4��+#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2#!��C��1����4��>��@��D��2��>��@Ԛ<#!��C��1����4��>��@��D��2��>��@Ԛ<�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2;9��&��F��D��6�D��2��4�9��@��D��2��>����1��X��J��V����V53��&��F��D��6�D��2��4�9��@��D��2��>����1��J����V#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2SQ��&����L����4�9��2��IщQP��=��&��1��X��4��B��D��7��1��X��G��:��&������T��6GEީ��L����4�9��2��IщQ��=��&��1��4��B��D��7��1��G��:��&������T��6�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2����6�9��4��2��A�7��B����6�9��4��+#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2 ��Cڜ>����4��2��K��.��B��@��K��Cڜ>����4��K��.��@��K�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2;9��&��F��D��6�D��2��4�9��@��D��2��>����1��X��J��V����V53��&��F��D��6�D��2��4�9��@��D��2��>����1��J����V#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2;9��6��1��&������6��P��>��4��2��9�Q��1��@����&��@��@��@20��6��1��&������6��P��>��4ƋQ��1��@����&��@��@�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2����6�9��4��2��A�7��B����6�9��4��+#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2#!������6��E��4��2��4ڜ>��2����AЍ��6��E��4��4ڜ>��2����A�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2;9��&��F��D��6�D��2��4�9��@��D��2��>����1��X��J��V����V53��&��F��D��6�D��2��4�9��@��D��2��>����1��J����V#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2��6�>����4��2��E��X��@��N��6�>����4��E��@��N�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2����6�9��4��2��A�7��B����6�9��4��+#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2����>��4ڜ>��F��5��@Ԛ<����>��4ڜ>��F��5��@Ԛ<�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2;9��&��F��D��6�D��2��4�9��@��D��2��>����1��X��J��V����V53��&��F��D��6�D��2��4�9��@��D��2��>����1��J����V#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2;9��C��R����W��6��?۱U��R��T��:����R��&��6��D��>��6�2��486��C��R����W��6��?۱U��R��T��:����R��&��6��D��>��D��4���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8MK����U��E��;��6֊2��>��W��6��,ϨH��@��F�L��6��,��B��,��T��E��;��>��A�7��BDB��Ǡ2��;��6֊2��>��W��6��,ϨH��@��F�L��6��,��B��,��T��E��;��>��+���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�853�D��U��E��>��W��@��P��2��1�H��S��V��9��;��W��@Ԛ<,*�DǠ2��>��W��@��7��1�H��S��V��9��=��@Ԛ<���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�886��U��E��2��V��=��L��2��9��6��T��=ȟN��2��D��S��>؞CԚ<20Ǡ2��2��V��=��L��2��9��6��T��=ȟN��2��D��Sρ>Ԛ<���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8JH��>؞C��X��A�N��;��W����S��V��6��2��6��D��U��E��=��W�L��6��,��6��@Ԛ<A?ρ>��X��A�N��=����S��V��6��2��6��DǠ2��=��W�L��6��,��6��@Ԛ<���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8PN��9��;��2��U��E��D����S��V�1��6��=��G��B��<�6��>؞Cб��:��6�����)��ʪDB��9��;��2Ǡ2��D����S��V�1��6��=��G��<ρ>б��:��6�����)��ʪ���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8DB��D��7��>��U��E��;��A����S��VϨH��,ϨH��W��;��6��2��>��T��6��@Ԛ<><��D��7��>Ǡ2��;��A����S��V؋8ϨH��W��;��6��2��>��T��6��@Ԛ<���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8;9��>��U��E��;��F��W��O��T��7��,��>��A��8��S��V��D��P�D��A20��>Ǡ2��;��F��W��3��7��,��>��A��S��V��D��P�D��A���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�886����U��E��6�1��6��=��V��6��>��6��L��=��>؞C��@��@��@/-��Ǡ2��6�1��6��=��V��6��>��6��L��=ρ>��@��@���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8)'��>؞C��U��E��;��V��6��2��6��D��S�D��A#!ρ>Ǡ2��;��V��6��2��6��D��S�D��A���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��TJH�R��6��>��#����H��L��6��M��9ٟ@��U��V��UӁG��DܤK��8��<��#����@��@��@;9�R��6��>��#��H��L��6��M�@��U��V��UӁG��A��8��<��#��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��T ��U��U��D��,��A��#��%��@��@��@��U��U��D��,��A��#��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��T20��U��N��.��T��5ƛK��,��6�I�1��6��#��%��@��@��@,*��U��N��.��T��5ƛK��,��6�I�1��6��#��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��TDB�9Ԛ<��6ϪJ��>��#����>��Q�@��D��9��D��FҾW��SܤK��#����@��@��@,*��1��6��>��#��>��Q�@��9��FҾW��S��#��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��T20��#��%��>��UӁG��D��9��D��.��7��>��#��%��@��@��@#!��#��>��UӁG��9��.��7��>��#��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��TMK�R��,����9��S��=ɵOʡH��9��B��>��U��U��D��=��UL�9��T��M��#����@��@��@;9�R��,����SɵO��9��>��U��U��D��=��UL�9��T��M��#��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��T&$��DПC��,��UӁG��DܤK��#����@��@��@��DПC��,��UӁG��A��#��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��T&$��E��>��F��#����U��D�K�0��@��@��@ ��E��>��F��#��U��D�K�0��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��T,*��#����U��D�K��-щQ��R��Q��#����@��@��@#!��#��U��D�K��-щQ��R��Q��#��@��@���T�;��J��C��;��X��H��-��T��;��C��;��X��H��-DB�;��J��Iٟ@��F������X��H��-��E��DܤK��V�3��E��T�����)��ʪ86��;��@��Fح��X��H��-��E��D��V�3��E��T�����)��ʪ��T�;��J��C��;��X��H��-��T��;��C��;��X��H��-YW��9��T����:��B��7ٟ@��������)��X��-���;��J��%��)ѾC��T��O��7��%��T�8��7��F��D�0A?��9��T����:��B��7ٟ@��X��-����;��)ѾC��T��O��7��%��T��8��F�0���T�;��J��C��;��X��H��-��T��;��C��;��X��H��-86�;��J�8��5��S���������X��H��-�8��E��6�O��@Ԛ<&$��;�8��5��S��X��H��-�8��E��6��@Ԛ<��T�;��J��C��;��X��H��-��T��;��C��;��X��H��-&$��5��M�;��J��.��B��7��H��1��R��@Ԛ<#!��5��M��;��.��B��7��H��1��R��@Ԛ<���T�;��J��C��;��X��H��-��T��;��C��;��X��H��-><�;��J�8��5��S���������X��H��-�8��E��6�O��D��S�D��A,*��;�8��5��S��X��H��-�8��E��6��D��S�D��A��T�;��J��C��;��X��H��-��T��;��C��;��X��H��-PN�;��J��A��5��D��N�8��R��8��E��B��S��;��7��6��X��H��-��N��F��K��,��D��P�D��AA?��;��A��5��D��N��R��N��B��S��;��5��X��H��-��N��F��,��D��P�D��A���T�;��J��C��;��X��H��-��T��;��C��;��X��H��-DB�;��J��Iٟ@��F������X��H��-��E��DܤK��V�3��E��T�����)��ʪ86��;��@��Fح��X��H��-��E��D��V�3��E��T�����)��ʪ��T�;��J��C��;��X��H��-��T��;��C��;��X��H��-_]��5��M�;��J��D��Cٟ@��F��2��6��K��:��X��-��R��B��9��S�8��@��D��6��9��>ҾW��D��,��D��P�D��APN��5��M��;��D��@��F��2��6��K��:��X��-��R��B��9��S�8��@��6ߖ>��D��,��D��P�D��A���T�;��J��C��;��X��H��-��T��;��C��;��X��H��-86�;��J�8��5��S���������X��H��-�8��E��6�O��@Ԛ<&$��;�8��5��S��X��H��-�8��E��6��@Ԛ<��T�;��J��C��;��X��H��-��T��;��C��;��X��H��-�;��J١-ܤK��S��/��@��N��;١-ܤK��S��@��N���T�;��J��C��;��X��H��-��T��;��C��;��X��H��-><�;��J�8��5��S���������X��H��-�8��E��6�O��D��S�D��A,*��;�8��5��S��X��H��-�8��E��6��D��S�D��A��T�;��J��C��;��X��H��-��T��;��C��;��X��H��-86��6��C��;����	��X��-��N��W��H��T�;��J�����)��ʪ/-��6��C��;����X��-��N��W��H��;�����)��ʪ�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�GE��5��D��R��9��3��A��7��.��8��R��AƛK��2��T��H�?��T�!��H��AM�8��6A?��5��D��R��9��3��A��7��8��R��AƛK��2��T��H�?��T�!��H��AM��8�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�86��5��R��Aб�D��9��3��A��7��.��8��R��A���!��@��@��@20��5��R��Aб�D��9��3��A��7��8��R��A���!��@��@�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�><ʡH��9��B��R��9����3��A��V��7��.��R��A��ϪJ��H��A��@��@��@20��9��R��9����3��A��V��7��R��A��ϪJ��H��A��@��@�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�;9�!��H��A��5��D��R��9��L��9��B��R��7��.��R��ϪJ��,��@Ԛ<86�!��H��A��5��D��R��9��L��9��B��R��7��R��ϪJ��,��@Ԛ<�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�ki��5��D��9��3��A��J��R��7��.��B��R��F��D��3����Bٟ@��7��5��Dٟ@��7��>��H��A��K��A��D��P�!��H��AM�8��6ec��5��D��9��3��A��J��R��7��B��R��F��D��3����Bٟ@��7��5��Dٟ@��7��>��H��A��K��A��D��P�!��H��AM��8�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�b`��5��D��R��9��L��9�D��R������7��.��3��>����3��R��Q��K��U��D��A��-��D��3��D�!��H��AM�8��6\Z��5��D��R��9��L��9�D��R������7��3��>����3��R��Q��K��U��D��A��-��D��3��D�!��H��AM��8�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�/-��5��D��9��L��9�D��7��.��R��ƭI�!��@��@��@)'��5��D��9��L��9�D��7��R��ƭI�!��@��@�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�hf��9��5��L��9�D��R��G��7��.��3��A��W�D��E��W��KѾC��H��T��7��H��A��7��:��6�����������)��ʪ��VT��9��5��L��9�D��R��G��7��3��A��W�D��E��W��K��5����:��6�����������)��ʪ���/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�><ϪJ��A��H��A��R��A��9�D��9��3��A��R��A��D��ϪJ��7��.��K��5;9ϪJ��A��H��A��R��A��9�D��9��3��A��R��A��D��ϪJ��7��K��5���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<DB��W��<��-��7��R��:��.��6��O�/��1��E��Pٟ@�9ٟ@�M��Bʔ7��7��>��P/-��W��<��7��R��:����1��E��P��9�M��Bݔ7��>��P���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<��R��-��R��-���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<53��9��.�<��J��O��<��-щQ��.��6��O��.��6��U��7��7��T&$��9�<��J��O��<щQ��.��O��.��UԚ7��T���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<20��.��6��O��3��7��;��0��G����.��6��.��6��<��B��B&$��.��O��3�7��0��G����.��.��<��B��B���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<��.��6��O��8�I��6��T��.��O��8�I��6��T���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<��.��6��O��T����K��6��.��O��T����K��6���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<20����.��6��O�/��E��E��D��.��6��O�/��E��E��"��W#!������E��D��.��6��O�/��E��"��W���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<DB��W��<��-��7��R��:��.��6��O�/��1��E��Pٟ@�9ٟ@�M��B��D��S�D��A20��W��<��7��R��:����1��E��P��9�M��B��D��S�D��A���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<><��W��<��-��7��R��:��.��6��O�/��1��E��Pٟ@�9ٟ@�M��B��@Ԛ<,*��W��<��7��R��:����1��E��P��9�M��B��@Ԛ<�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1,*��U��D��1��M��>��4��M��5��4Н?��A��3��A��T ��U��D��1ձM��4��M��5��4��A��A�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1,*��U��D��1��C��T��%��8��>��9��S�1��M��E��;)'��U��D��1��C��T��%��8��>��9��S�1��M��;�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1 ��U��D��1��U��3ʡH��W��R�D��U ��U��D��1��U��3ʡH��W��R�D��U�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1 ��U��D��1��M��>��4��M��5��G��3��U��D��1ձM��4��M��5��G�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1/-��U��8��J��D��1��U��H��AʡH����R��G��M��=��T,*��U��8��J��D��1��U��H��AʡH����R��G��M��=�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��120��U��D��1��F��B��L��L¶7��JѾC��4��W��,��M��4��;#!��U��D��1�B��N��J�C��W��1��4��;�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1&$��U��D��1ʡH��R��:��D��G��AʈO��>��6#!��U��D��1ʡH��R��:��D��G��A��>��6�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1)'��UȂ3��.��1��P��D��>��J١-��-�� ��A��B&$��UȂ3��.��1��P��>��J١-��-�� ��A��B�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1SQ��>��K��U��9��D��1��M��.��O��G��UʡH��9�>��9��U��1��9��9��>��U��6�9��I��T��@Ԛ<DB��>��K��U��9��D��1��M��.ǼO��U�>��9��U��1��9��>��U��6��I��T��@Ԛ<�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1#!��U��D��1��9ҧK��1�BPϪJ��>��D ��U��D��1��9ҧK��1�B��J��>��D�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1#!��U��D��1��9ҧK��1�BPϪJ��>��D ��U��D��1��9ҧK��1�B��J��>��D�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1)'��UȂ3��1��M��CP��Q��>��DԃP��E��A��B ��U͂3�MP��Q��>��D�U��A��B�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1)'��U��D��1P��Rޚ6��H��U��4��9��QÐW��B&$��U��D��1��Rޚ6��H��U��4��9��QÐW��B�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1&$��U��D��1��W��>β7��UщQ��D��G��@��K ��U��D��1��W��>ƴ7щQ��D��@��K�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1��U��D��1��MʡH��R��H��U��U��D��1��MʡH��R��H��U�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1/-��U��D��1��@��1��G��M��3̛<��:��9��T�����!#!��U��D��1��@��1��G��M��3��:��T��W�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1��U��D��1��>��N��V����N��F��U��D��1��>��N��V����N��F�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1,*��U��D��1��MʡH��W��R��H��U��J��6��J����7,*��U��D��1��MʡH��W��R��H��U��J��6��J����7�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1JH��U��B��M�B��U��D��1��9ҧK��1�BPϪJ��>��D��S��1��U��B��D��B��N��@Ԛ<><��U��M�B��U��D��1��9ҧK��1�B��J��>��Dū1��U��D��B��N��@Ԛ<�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��120��AʋM��Q��U��,��D��1��U��>��4��,��3��T��5��=��T&$��A��Q��U��,��D��1��U��>��4��3��5��=�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��153��4��U��D��1��M��/��5��S��7��H��4��7����N����H��)'��4��U��D��1��M��5��S������N����H���	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1)'��U��D��1��U��/��VӲU��>��/��=��W��Q��T ��U��D��1��*ӲU��>��/��=��W��Q�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1DB��U��D��1��UʡH��W��R��6��U��>�G��=��S��U��/��T��(����)������!/-��U��D��1��UʡH��W��R��6��U��>��G��S��U��T���	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1#!��U��D��1��M��>��U��.��6��<��B��B��U��D��1ձM��U��.��<��B��B�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��120��U��D��1�K��U��>�1��D��3̛<��2��/ќ6��H��Q��T&$��U��D��1�K��U��>�1��D��3��/��H��Q�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1DB��U��D��1ۓR��4��H��5�BPϪJ��>��D��3��K��T��(������(������!)'��U��D��1ۓR��4��H��5�B��J��>��D��3��K���EԼO��R��@����C��/��8��E������C��/��8#!��HԼO��R��@��C��/��8��>ٟ@��@Ԛ<��H����C��/��8��>ٟ@��@Ԛ<��EԼO��R��@����C��/��8��E������C��/��8qo��HԼO��R��@��C��/��D��8��>ٟ@��8��	��P��@��N��LΊ;�J��@��>��@��BΊ;��R��P��@��N��LΊ;�J��@��>��D��Oָ:��?��Thf��H����C��/��D��8��>ٟ@��8��	��P��@��N��LΊ;�J��@��>��@��BΊ;��R��P��@��N��LΊ;�J��@��>��Dָ:��?��T���EԼO��R��@����C��/��8��E������C��/��8ԼO��@��K��@��K�O��K��@��K��EԼO��R��@����C��/��8��E������C��/��8nl��HԼO��R��@��C��/��8��>ٟ@��8����P��HۇL��B��D��CɕH��5ǟV��G��R��P��H��G��L��B��D��=ږH��5��D��Oָ:��?��Tec��H����C��/��8��>ٟ@��8����P��HۇL��B��D��CɕH��5ǟV��G��R��P��H��G��L��B��D��=ږH��5��Dָ:��?��T���EԼO��R��@����C��/��8��E������C��/��8><ԼO��R��@��E��>��1��>��T��H��Iԓ4��C��/��8��>ٟ@����A�7��B,*����E��>��1��>��T����C��/��8��>ٟ@����+��EԼO��R��@����C��/��8��E������C��/��8}{��M��:İU��;ԼO��R��@��?��R��5��<��D��>��B��D�5��@��E��7��K՞R��W��K��D��5��C��/��8��>ٟ@��>��:��T��(������!��K��;�8��6_]��:����?��R��5��<��D��>��B��D�5��@��E��G՞RʼG��D��5��C��/��8��>ٟ@��>��:��T����K��;�8��6���EԼO��R��@����C��/��8��E������C��/��8#!��HԼO��R��@��C��/��8��>ٟ@��@Ԛ<��H����C��/��8��>ٟ@��@Ԛ<��EԼO��R��@����C��/��8��E������C��/��8#!ß<��:��Dć?ԼO��@��C��/��8��>ٟ@ ß<��:��Dć?�O��C��/��8��>ٟ@���EԼO��R��@����C��/��8��E������C��/��8ԼO��@��K��@��K�O��K��@��K��EԼO��R��@����C��/��8��E������C��/��8,*��HԼO��R��@��NܒM̺2��C��/��D��8��>ٟ@��#!��H����N�M��C��/��D��8��>ٟ@�����EԼO��R��@����C��/��8��E������C��/��8><ԼO��R��@��E��>��1��>��T��H��Iԓ4��C��/��8��>ٟ@����A�7��B,*����E��>��1��>��T����C��/��8��>ٟ@����+��EԼO��R��@����C��/��8��E������C��/��853��HԼO��R��@��C��/��D��8��>ٟ@����R��E��@��>��D��W/-��H����C��/��D��8��>ٟ@����R��E��@��>��D��W���EԼO��R��@����C��/��8��E������C��/��8#!��HԼO��R��@��C��/��8��>ٟ@��@Ԛ<��H����C��/��8��>ٟ@��@Ԛ<��EԼO��R��@����C��/��8��E������C��/��8&$��EԼO��@��C��/��8��>ٟ@��D��P�D��A#!��E�O��C��/��8��>ٟ@��D��P�D��A���EԼO��R��@����C��/��8��E������C��/��8ԼO��@��K��@��K�O��K��@��K��EԼO��R��@����C��/��8��E������C��/��8����HԼO��R��@��C��/��8��>ٟ@����D��>��A��I��H��!������D��>�H��I��H��:��@��>�Hٟ@��/ў7����:��@՞R��.ٟ@��/ў7��9��O��E��E��Xqo��H����C��/��8��>ٟ@����D��>��A��I��H��!������D��>�H��I��H��:��>�Hٟ@��/ў7����:��>ٟ@��/ў7��9��E��E��X���EԼO��R��@����C��/��8��E������C��/��8><ԼO��R��@��E��>��1��>��T��H��Iԓ4��C��/��8��>ٟ@����A�7��B,*����E��>��1��>��T����C��/��8��>ٟ@����+��EԼO��R��@����C��/��8��E������C��/��8;9��HԼO��@��C��/��8��>ٟ@����H��2��9��<��Cԓ4��>�9��2��653��H�O��C��/��8��>ٟ@����H��2��9��<��Cԓ4��>��2��6���EԼO��R��@����C��/��8��E������C��/��8#!��HԼO��R��@��C��/��8��>ٟ@��@Ԛ<��H����C��/��8��>ٟ@��@Ԛ<��EԼO��R��@����C��/��8��E������C��/��820��H��?��R��S��H��Iԓ4��>ԼO��@7��C��/��8��>ٟ@)'��H��?��R��S����>�O7��C��/��8��>ٟ@
\ No newline at end of file
diff --git a/paddle/trainer/tests/gen_proto_data.py b/paddle/trainer/tests/gen_proto_data.py
deleted file mode 100644
index 8cc6d44673b9f992c28ae95cc06db5ea5aca0642..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/gen_proto_data.py
+++ /dev/null
@@ -1,279 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from cStringIO import StringIO
-
-import paddle.proto.DataFormat_pb2 as DataFormat
-from google.protobuf.internal.encoder import _EncodeVarint
-
-import logging
-import pprint
-
-logging.basicConfig(
-    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
-logger = logging.getLogger('paddle')
-logger.setLevel(logging.INFO)
-
-OOV_POLICY_IGNORE = 0
-OOV_POLICY_USE = 1
-OOV_POLICY_ERROR = 2
-
-num_original_columns = 3
-
-# Feature combination patterns.
-# [[-1,0], [0,0]]  means previous token at column 0 and current token at
-# column 0 are combined as one feature.
-patterns = [
-    [[-2, 0]],
-    [[-1, 0]],
-    [[0, 0]],
-    [[1, 0]],
-    [[2, 0]],
-    [[-1, 0], [0, 0]],
-    [[0, 0], [1, 0]],
-    [[-2, 1]],
-    [[-1, 1]],
-    [[0, 1]],
-    [[1, 1]],
-    [[2, 1]],
-    [[-2, 1], [-1, 1]],
-    [[-1, 1], [0, 1]],
-    [[0, 1], [1, 1]],
-    [[1, 1], [2, 1]],
-    [[-2, 1], [-1, 1], [0, 1]],
-    [[-1, 1], [0, 1], [1, 1]],
-    [[0, 1], [1, 1], [2, 1]],
-]
-
-
-def make_features(sequence):
-    length = len(sequence)
-    num_features = len(sequence[0])
-
-    def get_features(pos):
-        if pos < 0:
-            return ['#B%s' % -pos] * num_features
-        if pos >= length:
-            return ['#E%s' % (pos - length + 1)] * num_features
-        return sequence[pos]
-
-    for i in xrange(length):
-        for pattern in patterns:
-            fname = '/'.join([get_features(i + pos)[f] for pos, f in pattern])
-            sequence[i].append(fname)
-
-
-'''
-Source file format:
-Each line is for one timestep. The features are separated by space.
-An empty line indicates end of a sequence.
-
-cutoff: a list of numbers. If count of a feature is smaller than this,
- it will be ignored.
-if oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of
-i-th column.
-
-return a list of dict for each column
-'''
-
-
-def create_dictionaries(filename, cutoff, oov_policy):
-    def add_to_dict(sequence, dicts):
-        num_features = len(dicts)
-        for features in sequence:
-            l = len(features)
-            assert l == num_features, "Wrong number of features " + line
-            for i in xrange(l):
-                if features[i] in dicts[i]:
-                    dicts[i][features[i]] += 1
-                else:
-                    dicts[i][features[i]] = 1
-
-    num_features = len(cutoff)
-    dicts = []
-    for i in xrange(num_features):
-        dicts.append(dict())
-
-    f = open(filename, 'rb')
-
-    sequence = []
-
-    for line in f:
-        line = line.strip()
-        if not line:
-            make_features(sequence)
-            add_to_dict(sequence, dicts)
-            sequence = []
-            continue
-        features = line.split(' ')
-        sequence.append(features)
-
-    for i in xrange(num_features):
-        dct = dicts[i]
-        n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
-        todo = []
-        for k, v in dct.iteritems():
-            if v < cutoff[i]:
-                todo.append(k)
-            else:
-                dct[k] = n
-                n += 1
-
-        if oov_policy[i] == OOV_POLICY_USE:
-            # placeholder so that len(dct) will be the number of features
-            # including OOV
-            dct['#OOV#'] = 0
-
-        logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo)))
-        for k in todo:
-            del dct[k]
-
-    f.close()
-    return dicts
-
-
-def encode_varint(v):
-    out = StringIO()
-    _EncodeVarint(out.write, v)
-    return out.getvalue()
-
-
-def write_proto(file, message):
-    s = message.SerializeToString()
-    packed_len = encode_varint(len(s))
-    file.write(packed_len + s)
-
-
-'''
-if oov_policy[i] == OOV_POLICY_USE, features in i-th column which are not
-existed in dicts[i] will be assigned to id 0.
-if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist
-in dicts[i].
-'''
-
-
-def gen_proto_file(input_file, dicts, oov_policy, output_file):
-    def write_sequence(out, sequence):
-        num_features = len(dicts)
-        is_beginning = True
-        for features in sequence:
-            assert len(features) == num_features, \
-                "Wrong number of features: " + line
-            sample = DataFormat.DataSample()
-            for i in xrange(num_original_columns):
-                id = dicts[i].get(features[i], -1)
-                if id != -1:
-                    sample.id_slots.append(id)
-                elif oov_policy[i] == OOV_POLICY_IGNORE:
-                    sample.id_slots.append(0xffffffff)
-                elif oov_policy[i] == OOV_POLICY_ERROR:
-                    logger.fatal("Unknown token: %s" % features[i])
-                else:
-                    sample.id_slots.append(0)
-
-            if patterns:
-                dim = 0
-                vec = sample.vector_slots.add()
-                for i in xrange(num_original_columns, num_features):
-                    id = dicts[i].get(features[i], -1)
-                    if id != -1:
-                        vec.ids.append(dim + id)
-                    elif oov_policy[i] == OOV_POLICY_IGNORE:
-                        pass
-                    elif oov_policy[i] == OOV_POLICY_ERROR:
-                        logger.fatal("Unknown token: %s" % features[i])
-                    else:
-                        vec.ids.append(dim + 0)
-
-                    dim += len(dicts[i])
-
-            sample.is_beginning = is_beginning
-            is_beginning = False
-            write_proto(out, sample)
-
-    num_features = len(dicts)
-    f = open(input_file, 'rb')
-    out = open(output_file, 'wb')
-
-    header = DataFormat.DataHeader()
-    if patterns:
-        slot_def = header.slot_defs.add()
-        slot_def.type = DataFormat.SlotDef.VECTOR_SPARSE_NON_VALUE
-        slot_def.dim = sum(
-            [len(dicts[i]) for i in xrange(num_original_columns, len(dicts))])
-        logger.info("feature_dim=%s" % slot_def.dim)
-
-    for i in xrange(num_original_columns):
-        slot_def = header.slot_defs.add()
-        slot_def.type = DataFormat.SlotDef.INDEX
-        slot_def.dim = len(dicts[i])
-
-    write_proto(out, header)
-
-    num_sequences = 0
-    sequence = []
-    for line in f:
-        line = line.strip()
-        if not line:
-            make_features(sequence)
-            write_sequence(out, sequence)
-            sequence = []
-            num_sequences += 1
-            continue
-        features = line.split(' ')
-        sequence.append(features)
-
-    f.close()
-    out.close()
-
-    logger.info("num_sequences=%s" % num_sequences)
-
-
-dict2 = {
-    'B-ADJP': 0,
-    'I-ADJP': 1,
-    'B-ADVP': 2,
-    'I-ADVP': 3,
-    'B-CONJP': 4,
-    'I-CONJP': 5,
-    'B-INTJ': 6,
-    'I-INTJ': 7,
-    'B-LST': 8,
-    'I-LST': 9,
-    'B-NP': 10,
-    'I-NP': 11,
-    'B-PP': 12,
-    'I-PP': 13,
-    'B-PRT': 14,
-    'I-PRT': 15,
-    'B-SBAR': 16,
-    'I-SBAR': 17,
-    'B-UCP': 18,
-    'I-UCP': 19,
-    'B-VP': 20,
-    'I-VP': 21,
-    'O': 22
-}
-
-if __name__ == '__main__':
-    cutoff = [3, 1, 0]
-    cutoff += [3] * len(patterns)
-    oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR]
-    oov_policy += [OOV_POLICY_IGNORE] * len(patterns)
-    dicts = create_dictionaries('trainer/tests/train.txt', cutoff, oov_policy)
-    dicts[2] = dict2
-    gen_proto_file('trainer/tests/train.txt', dicts, oov_policy,
-                   'trainer/tests/train_proto.bin')
-    gen_proto_file('trainer/tests/test.txt', dicts, oov_policy,
-                   'trainer/tests/test_proto.bin')
diff --git a/paddle/trainer/tests/mnist.list b/paddle/trainer/tests/mnist.list
deleted file mode 100644
index 703e87753d5a4f507aad11a6d875cea44787667b..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/mnist.list
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/mnist_bin_part
diff --git a/paddle/trainer/tests/mnist_bin_part b/paddle/trainer/tests/mnist_bin_part
deleted file mode 100644
index 08b93a0ebb5698bdafbc36c3c757918a50bab621..0000000000000000000000000000000000000000
Binary files a/paddle/trainer/tests/mnist_bin_part and /dev/null differ
diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data
deleted file mode 100644
index f189b21e86a50d70d317b5e43aa2d6e05af5e774..0000000000000000000000000000000000000000
Binary files a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data and /dev/null differ
diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist
deleted file mode 100644
index 6b406dff0ba91b5f310d7eafa111c0d21d6542c3..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist
+++ /dev/null
@@ -1 +0,0 @@
-./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data
diff --git a/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf b/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf
deleted file mode 100644
index 92f32a18c0068ab4672034a270aa8c52f2716d59..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf
+++ /dev/null
@@ -1,154 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-# Note: when making change to this file, please make sure
-# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest
-# for comparing these two nets can pass (test_CompareTwoNets)
-
-default_initial_std(0.1)
-default_device(0)
-
-word_dim = 999
-l1 = 0
-l2 = 0
-
-model_type("nn")
-
-sparse_update = get_config_arg("sparse_update", bool, False)
-
-TrainData(ProtoData(        
-            type = "proto_sequence",
-            files = ('trainer/tests/train_sparse.list'), 
-            ))
-
-Settings(
-    algorithm='sgd',
-    batch_size=100,
-    learning_rate=0.0001,
-    learning_rate_decay_a=4e-08,
-    learning_rate_decay_b=0.0,
-    learning_rate_schedule='poly',
-)
-
-
-wordvec_dim = 32
-layer2_dim = 16
-layer3_dim = 16
-hidden_dim = 32
-
-slot_names = ["qb", "qw", "tb", "tw"]
-
-def ltr_network(network_name,
-                word_dim=word_dim,
-                wordvec_dim=wordvec_dim,
-                layer2_dim=layer2_dim,
-                layer3_dim=layer3_dim,
-                hidden_dim=hidden_dim,
-                slot_names=slot_names,
-                l1=l1,
-                l2=l2):
-
-    slotnum = len(slot_names)
-    for i in xrange(slotnum):
-        Inputs(slot_names[i] + network_name)
-    for i in xrange(slotnum):
-        Layer(
-            name = slot_names[i] + network_name,
-            type = "data",
-            size = word_dim,
-            device = -1,
-        )
-        Layer(
-            name = slot_names[i] + "_embedding_" + network_name,
-            type = "mixed",
-            size = wordvec_dim,
-            bias = False,
-            device = -1,
-            inputs = TableProjection(slot_names[i] + network_name,
-                                     parameter_name = "embedding.w0",
-                                     decay_rate_l1=l1,
-                                     sparse_remote_update = True,
-                                     sparse_update = sparse_update,
-                                     ),
-        )
-        Layer(
-            name = slot_names[i] + "_rnn1_" + network_name,
-            type = "recurrent",
-            active_type = "tanh",
-            bias = Bias(initial_std = 0,
-                        parameter_name = "rnn1.bias"),
-            inputs = Input(slot_names[i] + "_embedding_" + network_name,
-                           parameter_name = "rnn1.w0")
-        )
-        Layer(
-            name = slot_names[i] + "_rnnlast_" + network_name,
-            type = "seqlastins",
-            inputs = [
-                slot_names[i] + "_rnn1_" + network_name,
-            ],
-        )
-
-    Layer(
-        name = "layer2_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer2_dim,
-        bias = Bias(parameter_name = "layer2.bias"),
-        inputs = [Input(slot_name + "_rnnlast_" + network_name, 
-                        parameter_name = "_layer2_" + slot_name + ".w", 
-                        decay_rate = l2, 
-                        initial_smart = True) for slot_name in slot_names]
-    )
-    Layer(
-        name = "layer3_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer3_dim,
-        bias = Bias(parameter_name = "layer3.bias"),
-        inputs = [
-            Input("layer2_" + network_name, 
-                  parameter_name = "_layer3.w", 
-                  decay_rate = l2, 
-                  initial_smart = True),
-        ]
-    )
-    Layer(
-        name = "output_" + network_name,
-        type = "fc",
-        size = 1,
-        bias = False,
-        inputs = [
-                  Input("layer3_" + network_name,
-                       parameter_name = "_layerO.w"),
-                 ],
-        )
-
-
-ltr_network("left")
-ltr_network("right")
-Inputs("label")
-Layer(
-    name = "label",
-    type = "data",
-    size = 1,
-    )
-Outputs("cost", "qb_rnnlast_left")
-Layer(
-    name = "cost",
-    type = "rank-cost",
-    inputs = ["output_left", "output_right", "label"],
-    )
diff --git a/paddle/trainer/tests/sample_trainer_config_opt_a.conf b/paddle/trainer/tests/sample_trainer_config_opt_a.conf
deleted file mode 100644
index b1744db8d604c88ec47e7104f79b38bb9d0e4442..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_trainer_config_opt_a.conf
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-################################### Data Configuration ###################################
-TrainData(ProtoData(files = "trainer/tests/mnist.list"))
-################################### Algorithm Configuration ###################################
-settings(batch_size = 1000,
-         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
-################################### Network Configuration ###################################
-data = data_layer(name ="input", size=784)
-
-fc1 = fc_layer(input=data, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=fc1, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-output = fc_layer(input=[fc1, fc2], size=10,
-                  bias_attr=True,
-                  act=SoftmaxActivation())
-
-lbl = data_layer(name ="label", size=1)
-
-cost = classification_cost(input=output, label=lbl)
-outputs(cost)
diff --git a/paddle/trainer/tests/sample_trainer_config_opt_b.conf b/paddle/trainer/tests/sample_trainer_config_opt_b.conf
deleted file mode 100644
index b1744db8d604c88ec47e7104f79b38bb9d0e4442..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_trainer_config_opt_b.conf
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-################################### Data Configuration ###################################
-TrainData(ProtoData(files = "trainer/tests/mnist.list"))
-################################### Algorithm Configuration ###################################
-settings(batch_size = 1000,
-         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
-################################### Network Configuration ###################################
-data = data_layer(name ="input", size=784)
-
-fc1 = fc_layer(input=data, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=fc1, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-output = fc_layer(input=[fc1, fc2], size=10,
-                  bias_attr=True,
-                  act=SoftmaxActivation())
-
-lbl = data_layer(name ="label", size=1)
-
-cost = classification_cost(input=output, label=lbl)
-outputs(cost)
diff --git a/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf b/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf
deleted file mode 100644
index d19222360c2f424ddb306b155dfef07921098a6b..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf
+++ /dev/null
@@ -1,154 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-# Note: when making change to this file, please make sure
-# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest
-# for comparing these two nets can pass (test_CompareTwoNets)
-
-default_initial_std(0.1)
-default_device(0)
-
-word_dim = 1451594
-l1 = 0
-l2 = 0
-
-model_type("nn")
-
-sparse_update = get_config_arg("sparse_update", bool, False)
-
-TrainData(ProtoData(        
-            type = "proto_sequence",
-            files = ('trainer/tests/train.list'), 
-            ))
-
-Settings(
-    algorithm='sgd',
-    batch_size=100,
-    learning_rate=0.0001,
-    learning_rate_decay_a=4e-08,
-    learning_rate_decay_b=0.0,
-    learning_rate_schedule='poly',
-)
-
-
-wordvec_dim = 128
-layer2_dim = 96
-layer3_dim = 96
-hidden_dim = 128
-
-slot_names = ["qb", "qw", "tb", "tw"]
-
-def ltr_network(network_name,
-                word_dim=word_dim,
-                wordvec_dim=wordvec_dim,
-                layer2_dim=layer2_dim,
-                layer3_dim=layer3_dim,
-                hidden_dim=hidden_dim,
-                slot_names=slot_names,
-                l1=l1,
-                l2=l2):
-
-    slotnum = len(slot_names)
-    for i in xrange(slotnum):
-        Inputs(slot_names[i] + network_name)
-    for i in xrange(slotnum):
-        Layer(
-            name = slot_names[i] + network_name,
-            type = "data",
-            size = word_dim,
-            device = -1,
-        )
-        Layer(
-            name = slot_names[i] + "_embedding_" + network_name,
-            type = "mixed",
-            size = wordvec_dim,
-            bias = False,
-            device = -1,
-            inputs = TableProjection(slot_names[i] + network_name,
-                                     parameter_name = "embedding.w0",
-                                     decay_rate_l1=l1,
-                                     sparse_remote_update = True,
-                                     sparse_update = sparse_update,
-                                     ),
-        )
-        Layer(
-            name = slot_names[i] + "_rnn1_" + network_name,
-            type = "recurrent",
-            active_type = "tanh",
-            bias = Bias(initial_std = 0,
-                        parameter_name = "rnn1.bias"),
-            inputs = Input(slot_names[i] + "_embedding_" + network_name,
-                           parameter_name = "rnn1.w0")
-        )
-        Layer(
-            name = slot_names[i] + "_rnnlast_" + network_name,
-            type = "seqlastins",
-            inputs = [
-                slot_names[i] + "_rnn1_" + network_name,
-            ],
-        )
-
-    Layer(
-        name = "layer2_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer2_dim,
-        bias = Bias(parameter_name = "layer2.bias"),
-        inputs = [Input(slot_name + "_rnnlast_" + network_name, 
-                        parameter_name = "_layer2_" + slot_name + ".w", 
-                        decay_rate = l2, 
-                        initial_smart = True) for slot_name in slot_names]
-    )
-    Layer(
-        name = "layer3_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer3_dim,
-        bias = Bias(parameter_name = "layer3.bias"),
-        inputs = [
-            Input("layer2_" + network_name, 
-                  parameter_name = "_layer3.w", 
-                  decay_rate = l2, 
-                  initial_smart = True),
-        ]
-    )
-    Layer(
-        name = "output_" + network_name,
-        type = "fc",
-        size = 1,
-        bias = False,
-        inputs = [
-                  Input("layer3_" + network_name,
-                       parameter_name = "_layerO.w"),
-                 ],
-        )
-
-
-ltr_network("left")
-ltr_network("right")
-Inputs("label")
-Layer(
-    name = "label",
-    type = "data",
-    size = 1,
-    )
-Outputs("cost", "qb_rnnlast_left")
-Layer(
-    name = "cost",
-    type = "rank-cost",
-    inputs = ["output_left", "output_right", "label"],
-    )
diff --git a/paddle/trainer/tests/sample_trainer_config_rnn.conf b/paddle/trainer/tests/sample_trainer_config_rnn.conf
deleted file mode 100644
index b720d4d5a6ca59e207832a8c5410c2cb6074c439..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_trainer_config_rnn.conf
+++ /dev/null
@@ -1,180 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-# Note: when making change to this file, please make sure
-# sample_trainer_config_qb_rnn.conf is changed accordingly so that the uniitest
-# for comparing these two nets can pass (test_CompareTwoNets)
-
-default_initial_std(0.1)
-default_device(0)
-
-word_dim = 1451594
-l1 = 0
-l2 = 0
-
-model_type("recurrent_nn")
-
-sparse_update = get_config_arg("sparse_update", bool, False)
-
-TrainData(ProtoData(
-            type = "proto_sequence",
-            files = ('trainer/tests/train.list'), 
-            ))
-
-Settings(
-    algorithm='sgd',
-    batch_size=100,
-    learning_rate=0.0001,
-    learning_rate_decay_a=4e-08,
-    learning_rate_decay_b=0.0,
-    learning_rate_schedule='poly',
-)
-
-
-wordvec_dim = 128
-layer2_dim = 96
-layer3_dim = 96
-hidden_dim = 128
-
-slot_names = ["qb", "qw", "tb", "tw"]
-
-def SimpleRecurrentLayer(name, 
-                         size, 
-                         active_type, 
-                         bias, 
-                         input_layer_name, 
-                         parameter_name,
-                         seq_reversed = False):
-    RecurrentLayerGroupBegin(name + "_layer_group", 
-                             in_links=[input_layer_name], 
-                             out_links=[name],
-                             seq_reversed=seq_reversed)
-    memory_name = Memory(name=name, size=size)
-    Layer(
-        name = name,
-        type = "mixed",
-        size = size,
-        active_type = active_type,
-        bias = bias,
-        inputs = [IdentityProjection(input_layer_name),
-                  FullMatrixProjection(memory_name,
-                                       parameter_name = parameter_name,
-                                       ),
-                  ]
-        )
-    RecurrentLayerGroupEnd(name + "_layer_group")
-
-
-def ltr_network(network_name,
-                word_dim=word_dim,
-                wordvec_dim=wordvec_dim,
-                layer2_dim=layer2_dim,
-                layer3_dim=layer3_dim,
-                hidden_dim=hidden_dim,
-                slot_names=slot_names,
-                l1=l1,
-                l2=l2):
-
-    slotnum = len(slot_names)
-    for i in xrange(slotnum):
-        Inputs(slot_names[i] + network_name)
-    for i in xrange(slotnum):
-        Layer(
-            name = slot_names[i] + network_name,
-            type = "data",
-            size = word_dim,
-            device = -1,
-        )
-        Layer(
-            name = slot_names[i] + "_embedding_" + network_name,
-            type = "mixed",
-            size = wordvec_dim,
-            bias = False,
-            device = -1,
-            inputs = TableProjection(slot_names[i] + network_name,
-                                     parameter_name = "embedding.w0",
-                                     decay_rate_l1=l1,
-                                     sparse_remote_update = True,
-                                     sparse_update = sparse_update,
-                                     ),
-        )
-        SimpleRecurrentLayer(
-            name = slot_names[i] + "_rnn1_" + network_name,
-            size = hidden_dim,
-            active_type = "tanh",
-            bias = Bias(initial_std = 0,
-                        parameter_name = "rnn1.bias"),
-            input_layer_name = slot_names[i] + "_embedding_" + network_name,
-            parameter_name = "rnn1.w0",
-            )
-        Layer(
-            name = slot_names[i] + "_rnnlast_" + network_name,
-            type = "seqlastins",
-            inputs = [
-                slot_names[i] + "_rnn1_" + network_name,
-            ],
-        )
-    Layer(
-        name = "layer2_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer2_dim,
-        bias = Bias(parameter_name = "layer2.bias"),
-        inputs = [Input(slot_name + "_rnnlast_" + network_name, 
-                        parameter_name = "_layer2_" + slot_name + ".w", 
-                        decay_rate = l2, 
-                        initial_smart = True) for slot_name in slot_names]
-    )
-    Layer(
-        name = "layer3_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer3_dim,
-        bias = Bias(parameter_name = "layer3.bias"),
-        inputs = [
-            Input("layer2_" + network_name, 
-                  parameter_name = "_layer3.w", 
-                  decay_rate = l2, 
-                  initial_smart = True),
-        ]
-    )
-    Layer(
-        name = "output_" + network_name,
-        type = "fc",
-        size = 1,
-        bias = False,
-        inputs = [
-                  Input("layer3_" + network_name,
-                       parameter_name = "_layerO.w"),
-                 ],
-        )
-
-
-ltr_network("left")
-ltr_network("right")
-Inputs("label")
-Layer(
-    name = "label",
-    type = "data",
-    size = 1,
-    )
-Outputs("cost", "qb_rnnlast_left")
-Layer(
-    name = "cost",
-    type = "rank-cost",
-    inputs = ["output_left", "output_right", "label"],
-    )
diff --git a/paddle/trainer/tests/test.txt b/paddle/trainer/tests/test.txt
deleted file mode 100644
index 3ad503b34f2e1a84c632d0894f180b5cf9ac550a..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/test.txt
+++ /dev/null
@@ -1,1000 +0,0 @@
-Confidence NN B-NP
-in IN B-PP
-the DT B-NP
-pound NN I-NP
-is VBZ B-VP
-widely RB I-VP
-expected VBN I-VP
-to TO I-VP
-take VB I-VP
-another DT B-NP
-sharp JJ I-NP
-dive NN I-NP
-if IN B-SBAR
-trade NN B-NP
-figures NNS I-NP
-for IN B-PP
-September NNP B-NP
-, , O
-due JJ B-ADJP
-for IN B-PP
-release NN B-NP
-tomorrow NN B-NP
-, , O
-fail VB B-VP
-to TO I-VP
-show VB I-VP
-a DT B-NP
-substantial JJ I-NP
-improvement NN I-NP
-from IN B-PP
-July NNP B-NP
-and CC I-NP
-August NNP I-NP
-'s POS B-NP
-near-record JJ I-NP
-deficits NNS I-NP
-. . O
-
-Chancellor NNP O
-of IN B-PP
-the DT B-NP
-Exchequer NNP I-NP
-Nigel NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-restated VBN I-NP
-commitment NN I-NP
-to TO B-PP
-a DT B-NP
-firm NN I-NP
-monetary JJ I-NP
-policy NN I-NP
-has VBZ B-VP
-helped VBN I-VP
-to TO I-VP
-prevent VB I-VP
-a DT B-NP
-freefall NN I-NP
-in IN B-PP
-sterling NN B-NP
-over IN B-PP
-the DT B-NP
-past JJ I-NP
-week NN I-NP
-. . O
-
-But CC O
-analysts NNS B-NP
-reckon VBP B-VP
-underlying VBG B-NP
-support NN I-NP
-for IN B-PP
-sterling NN B-NP
-has VBZ B-VP
-been VBN I-VP
-eroded VBN I-VP
-by IN B-PP
-the DT B-NP
-chancellor NN I-NP
-'s POS B-NP
-failure NN I-NP
-to TO B-VP
-announce VB I-VP
-any DT B-NP
-new JJ I-NP
-policy NN I-NP
-measures NNS I-NP
-in IN B-PP
-his PRP$ B-NP
-Mansion NNP I-NP
-House NNP I-NP
-speech NN I-NP
-last JJ B-NP
-Thursday NNP I-NP
-. . O
-
-This DT B-NP
-has VBZ B-VP
-increased VBN I-VP
-the DT B-NP
-risk NN I-NP
-of IN B-PP
-the DT B-NP
-government NN I-NP
-being VBG B-VP
-forced VBN I-VP
-to TO I-VP
-increase VB I-VP
-base NN B-NP
-rates NNS I-NP
-to TO B-PP
-16 CD B-NP
-% NN I-NP
-from IN B-PP
-their PRP$ B-NP
-current JJ I-NP
-15 CD I-NP
-% NN I-NP
-level NN I-NP
-to TO B-VP
-defend VB I-VP
-the DT B-NP
-pound NN I-NP
-, , O
-economists NNS B-NP
-and CC O
-foreign JJ B-NP
-exchange NN I-NP
-market NN I-NP
-analysts NNS I-NP
-say VBP B-VP
-. . O
-
-`` `` O
-The DT B-NP
-risks NNS I-NP
-for IN B-PP
-sterling NN B-NP
-of IN B-PP
-a DT B-NP
-bad JJ I-NP
-trade NN I-NP
-figure NN I-NP
-are VBP B-VP
-very RB B-ADVP
-heavily RB I-ADVP
-on IN B-PP
-the DT B-NP
-down JJ I-NP
-side NN I-NP
-, , O
-'' '' O
-said VBD B-VP
-Chris NNP B-NP
-Dillow NNP I-NP
-, , O
-senior JJ B-NP
-U.K. NNP I-NP
-economist NN I-NP
-at IN B-PP
-Nomura NNP B-NP
-Research NNP I-NP
-Institute NNP I-NP
-. . O
-
-`` `` O
-If IN B-SBAR
-there EX B-NP
-is VBZ B-VP
-another DT B-NP
-bad JJ I-NP
-trade NN I-NP
-number NN I-NP
-, , O
-there EX B-NP
-could MD B-VP
-be VB I-VP
-an DT B-NP
-awful JJ I-NP
-lot NN I-NP
-of IN B-PP
-pressure NN B-NP
-, , O
-'' '' O
-noted VBD B-VP
-Simon NNP B-NP
-Briscoe NNP I-NP
-, , O
-U.K. NNP B-NP
-economist NN I-NP
-for IN B-PP
-Midland NNP B-NP
-Montagu NNP I-NP
-, , O
-a DT B-NP
-unit NN I-NP
-of IN B-PP
-Midland NNP B-NP
-Bank NNP I-NP
-PLC NNP I-NP
-. . O
-
-Forecasts NNS B-NP
-for IN B-PP
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-range VBP B-VP
-widely RB B-ADVP
-, , O
-but CC O
-few JJ B-NP
-economists NNS I-NP
-expect VBP B-VP
-the DT B-NP
-data NNS I-NP
-to TO B-VP
-show VB I-VP
-a DT B-NP
-very RB I-NP
-marked VBN I-NP
-improvement NN I-NP
-from IN B-PP
-the DT O
-# # O
-2 CD O
-billion CD O
--LRB- ( O
-$ $ B-ADJP
-3.2 CD O
-billion CD O
--RRB- ) O
-deficit NN B-NP
-in IN B-PP
-the DT B-NP
-current JJ I-NP
-account NN I-NP
-reported VBD B-VP
-for IN B-PP
-August NNP B-NP
-. . O
-
-The DT B-NP
-August NNP I-NP
-deficit NN I-NP
-and CC O
-the DT B-NP
-# # I-NP
-2.2 CD I-NP
-billion CD I-NP
-gap NN I-NP
-registered VBN B-VP
-in IN B-PP
-July NNP B-NP
-are VBP B-VP
-topped VBN I-VP
-only RB B-ADVP
-by IN B-PP
-the DT B-NP
-# # I-NP
-2.3 CD I-NP
-billion CD I-NP
-deficit NN I-NP
-of IN B-PP
-October NNP B-NP
-1988 CD I-NP
-. . O
-
-Sanjay NNP B-NP
-Joshi NNP I-NP
-, , O
-European JJ B-NP
-economist NN I-NP
-at IN B-PP
-Baring NNP B-NP
-Brothers NNPS I-NP
-& CC I-NP
-Co. NNP I-NP
-, , O
-said VBD B-VP
-there EX B-NP
-is VBZ B-VP
-no DT B-NP
-sign NN I-NP
-that IN B-SBAR
-Britain NNP B-NP
-'s POS B-NP
-manufacturing NN I-NP
-industry NN I-NP
-is VBZ B-VP
-transforming VBG I-VP
-itself PRP B-NP
-to TO B-VP
-boost VB I-VP
-exports NNS B-NP
-. . O
-
-At IN B-PP
-the DT B-NP
-same JJ I-NP
-time NN I-NP
-, , O
-he PRP B-NP
-remains VBZ B-VP
-fairly RB B-ADJP
-pessimistic JJ I-ADJP
-about IN B-PP
-the DT B-NP
-outlook NN I-NP
-for IN B-PP
-imports NNS B-NP
-, , O
-given VBN B-PP
-continued VBD B-NP
-high JJ I-NP
-consumer NN I-NP
-and CC I-NP
-capital NN I-NP
-goods NNS I-NP
-inflows NNS I-NP
-. . O
-
-He PRP B-NP
-reckons VBZ B-VP
-the DT B-NP
-current JJ I-NP
-account NN I-NP
-deficit NN I-NP
-will MD B-VP
-narrow VB I-VP
-to TO B-PP
-only RB B-NP
-# # I-NP
-1.8 CD I-NP
-billion CD I-NP
-in IN B-PP
-September NNP B-NP
-. . O
-
-However RB B-ADVP
-, , O
-Mr. NNP B-NP
-Dillow NNP I-NP
-said VBD B-VP
-he PRP B-NP
-believes VBZ B-VP
-that IN B-SBAR
-a DT B-NP
-reduction NN I-NP
-in IN B-PP
-raw JJ B-NP
-material NN I-NP
-stockbuilding VBG I-NP
-by IN B-PP
-industry NN B-NP
-could MD B-VP
-lead VB I-VP
-to TO B-PP
-a DT B-NP
-sharp JJ I-NP
-drop NN I-NP
-in IN B-PP
-imports NNS B-NP
-. . O
-
-Combined VBN B-PP
-with IN B-PP
-at IN B-ADVP
-least JJS I-ADVP
-some DT B-NP
-rebound NN I-NP
-in IN B-PP
-exports NNS B-NP
-after IN B-PP
-August NNP B-NP
-'s POS B-NP
-unexpected JJ I-NP
-decline NN I-NP
-, , O
-the DT B-NP
-deficit NN I-NP
-could MD B-VP
-narrow VB I-VP
-to TO B-PP
-as RB B-NP
-little JJ I-NP
-as IN I-NP
-# # I-NP
-1.3 CD I-NP
-billion CD I-NP
-. . O
-
-Mr. NNP B-NP
-Briscoe NNP I-NP
-, , O
-who WP B-NP
-also RB B-ADVP
-forecasts VBZ B-VP
-a DT B-NP
-# # I-NP
-1.3 CD I-NP
-billion CD I-NP
-current JJ I-NP
-account NN I-NP
-gap NN I-NP
-, , O
-warns VBZ B-VP
-that IN B-SBAR
-even RB B-SBAR
-if IN I-SBAR
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-are VBP B-VP
-bullish JJ B-ADJP
-for IN B-PP
-sterling NN B-NP
-, , O
-the DT B-NP
-currency NN I-NP
-wo MD B-VP
-n't RB I-VP
-advance VB I-VP
-much JJ B-NP
-because IN B-SBAR
-investors NNS B-NP
-will MD B-VP
-want VB I-VP
-to TO I-VP
-see VB I-VP
-further JJ B-NP
-evidence NN I-NP
-of IN B-PP
-the DT B-NP
-turnaround NN I-NP
-before IN B-PP
-adjusting VBG B-VP
-positions NNS B-NP
-. . O
-
-Nevertheless RB B-ADVP
-, , O
-he PRP B-NP
-noted VBD B-VP
-, , O
-`` `` O
-No DT B-NP
-one PRP I-NP
-will MD B-VP
-want VB I-VP
-to TO I-VP
-go VB I-VP
-into IN B-PP
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-without IN B-PP
-a DT B-NP
-flat JJ I-NP
-position NN I-NP
-'' '' O
-in IN B-PP
-the DT B-NP
-pound NN I-NP
-. . O
-
-Meanwhile RB B-ADVP
-, , O
-overall JJ B-NP
-evidence NN I-NP
-on IN B-PP
-the DT B-NP
-economy NN I-NP
-remains VBZ B-VP
-fairly RB B-ADJP
-clouded VBN I-ADJP
-. . O
-
-In IN B-PP
-his PRP$ B-NP
-Mansion NNP I-NP
-House NNP I-NP
-speech NN I-NP
-, , O
-Mr. NNP B-NP
-Lawson NNP I-NP
-warned VBD B-VP
-that IN B-SBAR
-a DT B-NP
-further JJ I-NP
-slowdown NN I-NP
-can MD B-VP
-be VB I-VP
-expected VBN I-VP
-as IN B-SBAR
-the DT B-NP
-impact NN I-NP
-of IN B-PP
-the DT B-NP
-last JJ I-NP
-rise NN I-NP
-in IN B-PP
-interest NN B-NP
-rates NNS I-NP
-earlier RBR B-NP
-this DT I-NP
-month NN I-NP
-takes VBZ B-VP
-effect NN B-NP
-. . O
-
-U.K. JJ B-NP
-base NN I-NP
-rates NNS I-NP
-are VBP B-VP
-at IN B-PP
-their PRP$ B-NP
-highest JJS I-NP
-level NN I-NP
-in IN B-PP
-eight CD B-NP
-years NNS I-NP
-. . O
-
-But CC O
-consumer NN B-NP
-expenditure NN I-NP
-data NNS I-NP
-released VBD B-VP
-Friday NNP B-NP
-do VBP B-VP
-n't RB I-VP
-suggest VB I-VP
-that IN B-SBAR
-the DT B-NP
-U.K. NNP I-NP
-economy NN I-NP
-is VBZ B-VP
-slowing VBG I-VP
-that DT B-ADVP
-quickly RB I-ADVP
-. . O
-
-The DT B-NP
-figures NNS I-NP
-show VBP B-VP
-that DT O
-spending NN B-NP
-rose VBD B-VP
-0.1 CD B-NP
-% NN I-NP
-in IN B-PP
-the DT B-NP
-third JJ I-NP
-quarter NN I-NP
-from IN B-PP
-the DT B-NP
-second JJ I-NP
-quarter NN I-NP
-and CC O
-was VBD B-VP
-up IN B-ADVP
-3.8 CD B-NP
-% NN I-NP
-from IN B-PP
-a DT B-NP
-year NN I-NP
-ago RB B-ADVP
-. . O
-
-This DT B-NP
-compares VBZ B-VP
-with IN B-PP
-a DT B-NP
-1.6 CD I-NP
-% NN I-NP
-rise NN I-NP
-in IN B-PP
-the DT B-NP
-second NN I-NP
-from IN B-PP
-the DT B-NP
-first JJ I-NP
-quarter NN I-NP
-and CC O
-a DT B-NP
-5.4 CD I-NP
-% NN I-NP
-increase NN I-NP
-from IN B-PP
-the DT B-NP
-second JJ I-NP
-quarter NN I-NP
-of IN B-PP
-1988 CD B-NP
-. . O
-
-Mr. NNP B-NP
-Dillow NNP I-NP
-said VBD B-VP
-the DT B-NP
-data NNS I-NP
-show VBP B-VP
-the DT B-NP
-economy NN I-NP
-`` `` O
-is VBZ B-VP
-still RB B-ADVP
-quite RB B-ADJP
-strong JJ I-ADJP
-, , O
-'' '' O
-but CC O
-suggestions NNS B-NP
-that IN B-SBAR
-much NN B-NP
-of IN B-PP
-the DT B-NP
-spending NN I-NP
-went VBD B-VP
-on IN B-PP
-services NNS B-NP
-rather RB B-PP
-than IN I-PP
-consumer NN B-NP
-goods NNS I-NP
-should MD B-VP
-reduce VB I-VP
-fears NNS B-NP
-of IN B-PP
-more JJR B-NP
-import NN I-NP
-rises NNS I-NP
-. . O
-
-Certainly RB B-ADVP
-, , O
-the DT B-NP
-chancellor NN I-NP
-has VBZ B-VP
-made VBN I-VP
-it PRP B-NP
-clear JJ B-ADJP
-that IN B-SBAR
-he PRP B-NP
-is VBZ B-VP
-prepared VBN I-VP
-to TO I-VP
-increase VB I-VP
-interest NN B-NP
-rates NNS I-NP
-again RB B-ADVP
-if IN B-SBAR
-necessary JJ B-ADJP
-to TO B-VP
-both DT I-VP
-ensure VB I-VP
-that IN B-SBAR
-a DT B-NP
-substantial JJ I-NP
-slowdown NN I-NP
-does VBZ B-VP
-take VB I-VP
-place NN B-NP
-and CC O
-that DT O
-sterling NN B-NP
-does VBZ B-VP
-n't RB I-VP
-decline VB I-VP
-further JJ B-ADVP
-. . O
-
-Thursday NNP B-NP
-, , O
-he PRP B-NP
-reminded VBD B-VP
-his PRP$ B-NP
-audience NN I-NP
-that IN B-SBAR
-the DT B-NP
-government NN I-NP
-`` `` O
-can MD B-VP
-not RB I-VP
-allow VB I-VP
-the DT B-NP
-necessary JJ I-NP
-rigor NN I-NP
-of IN B-PP
-monetary JJ B-NP
-policy NN I-NP
-to TO B-VP
-be VB I-VP
-undermined VBN I-VP
-by IN B-PP
-exchange NN B-NP
-rate NN I-NP
-weakness NN I-NP
-. . O
-'' '' O
-
-Analysts NNS B-NP
-agree VBP B-VP
-there EX B-NP
-is VBZ B-VP
-little JJ B-NP
-holding NN B-VP
-sterling NN B-NP
-firm NN B-ADJP
-at IN B-PP
-the DT B-NP
-moment NN I-NP
-other JJ B-ADJP
-than IN B-PP
-Mr. NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-promise NN I-NP
-that IN B-SBAR
-rates NNS B-NP
-will MD B-VP
-be VB I-VP
-pushed VBN I-VP
-higher JJR B-ADJP
-if IN B-SBAR
-necessary JJ B-ADJP
-. . O
-
-And CC O
-, , O
-they PRP B-NP
-warn VBP B-VP
-, , O
-any DT B-NP
-further JJ I-NP
-drop NN I-NP
-in IN B-PP
-the DT B-NP
-government NN I-NP
-'s POS B-NP
-popularity NN I-NP
-could MD B-VP
-swiftly RB I-VP
-make VB I-VP
-this DT B-NP
-promise NN I-NP
-sound NN B-VP
-hollow JJ B-ADJP
-. . O
-
-Sterling NNP B-NP
-was VBD B-VP
-already RB I-VP
-showing VBG I-VP
-some DT B-NP
-signs NNS I-NP
-of IN B-PP
-a DT B-NP
-lack NN I-NP
-of IN B-PP
-confidence NN B-NP
-in IN B-PP
-Mr. NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-promise NN I-NP
-Friday NNP B-NP
-. . O
-
-In IN B-PP
-European JJ B-NP
-trading NN I-NP
-it PRP B-NP
-declined VBD B-VP
-to TO B-PP
-$ $ B-NP
-1.5890 CD I-NP
-and CC O
-2.9495 CD B-NP
-marks NNS I-NP
-from IN B-PP
-$ $ B-NP
-1.5940 CD I-NP
-and CC O
-2.9429 CD B-NP
-marks NNS I-NP
-late JJ B-NP
-Thursday NNP I-NP
-. . O
-
-Economists NNS B-NP
-suggested VBD B-VP
-that IN B-SBAR
-if IN B-SBAR
-the DT B-NP
-pound NN I-NP
-falls VBZ B-VP
-much JJ B-NP
-below IN B-PP
-2.90 CD B-NP
-marks NNS I-NP
-, , O
-the DT B-NP
-government NN I-NP
-will MD B-VP
-be VB I-VP
-forced VBN I-VP
-to TO I-VP
-increase VB I-VP
-rates NNS B-NP
-to TO B-PP
-16 CD B-NP
-% NN I-NP
-, , O
-both DT B-VP
-to TO I-VP
-halt VB B-VP
-any DT B-NP
-further JJ I-NP
-decline NN I-NP
-and CC O
-ensure VB B-VP
-that IN B-SBAR
-the DT B-NP
-balance NN I-NP
-of IN B-PP
-monetary JJ B-NP
-policy NN I-NP
-remains VBZ B-VP
-unchanged JJ B-ADJP
-. . O
-
-Friday NNP B-NP
-'s POS B-NP
-Market NNP I-NP
-Activity NN I-NP
-
-The DT B-NP
-dollar NN I-NP
-posted VBD B-VP
-gains NNS B-NP
-in IN B-PP
-quiet JJ B-NP
-trading NN I-NP
-as IN B-SBAR
-concerns NNS B-NP
-about IN B-PP
-equities NNS B-NP
-abated VBN B-VP
-. . O
-
-Foreign JJ B-NP
-exchange NN I-NP
-dealers NNS I-NP
-said VBD B-VP
-that IN B-SBAR
-the DT B-NP
-currency NN I-NP
-market NN I-NP
-has VBZ B-VP
-begun VBN I-VP
-to TO I-VP
-distance VB I-VP
-itself PRP B-NP
-from IN B-PP
-the DT B-NP
-volatile JJ I-NP
-stock NN I-NP
-exchange NN I-NP
-, , O
-which WDT B-NP
-has VBZ B-VP
-preoccupied VBN I-VP
-the DT B-NP
-market NN I-NP
-since IN B-PP
-Oct. NNP B-NP
-13 CD I-NP
-, , O
-when WRB B-ADVP
-the DT B-NP
-Dow NNP I-NP
-Jones NNP I-NP
-Industrial NNP I-NP
-Average NNP I-NP
-plunged VBD B-VP
-more JJR B-NP
-than IN I-NP
-190 CD I-NP
-points NNS I-NP
-. . O
-
-Currency NN B-NP
-analysts NNS I-NP
-predict VBP B-VP
-that IN B-SBAR
-in IN B-PP
-the DT B-NP
-coming VBG I-NP
-week NN I-NP
-the DT B-NP
-foreign JJ I-NP
-exchange NN I-NP
-market NN I-NP
-will MD B-VP
-shift VB I-VP
-its PRP$ B-NP
-focus NN I-NP
-back RB B-ADVP
-to TO B-PP
-economic JJ B-NP
-fundamentals NNS I-NP
-, , O
-keeping VBG B-VP
-a DT B-NP
-close NN I-NP
-eye NN I-NP
-out IN B-ADVP
-for IN B-PP
-any DT B-NP
-signs NNS I-NP
-of IN B-PP
-monetary JJ B-NP
-easing NN I-NP
-by IN B-PP
-U.S. NNP B-NP
-Federal NNP I-NP
-Reserve NNP I-NP
-. . O
-
-Late RB B-ADVP
-in IN B-PP
-the DT B-NP
-New NNP I-NP
-York NNP I-NP
-trading NN I-NP
-day NN I-NP
-, , O
-the DT B-NP
-dollar NN I-NP
-was VBD B-VP
-quoted VBN I-VP
-at IN B-PP
-1.8578 CD B-NP
-marks NNS I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-1.8470 CD B-NP
-marks NNS I-NP
-late JJ B-NP
-Thursday NNP I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-. . O
-
-The DT B-NP
-U.S. NNP I-NP
-currency NN I-NP
-was VBD B-VP
-also RB I-VP
-changing VBG I-VP
-hands NNS B-NP
-at IN B-PP
-142.43 CD B-NP
-yen NN I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-141.70 CD B-NP
-yen NN I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-late JJ B-NP
-Thursday NNP I-NP
-. . O
-
-In IN B-PP
-Tokyo NNP B-NP
-on IN B-PP
-Monday NNP B-NP
-, , O
-the DT B-NP
-U.S. NNP I-NP
-currency NN I-NP
-opened VBD B-VP
-for IN B-PP
-trading NN B-NP
-at IN B-PP
-141.95 CD B-NP
-yen NN I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-Friday NNP B-NP
-'s POS B-NP
-Tokyo NNP I-NP
diff --git a/paddle/trainer/tests/testPyDataWrapper.py b/paddle/trainer/tests/testPyDataWrapper.py
index 2c29a274339747b78fbd6c27ae4070f0abbd4028..a76eeeacb91cdba305d2f71c6292f79e4b98dd73 100644
--- a/paddle/trainer/tests/testPyDataWrapper.py
+++ b/paddle/trainer/tests/testPyDataWrapper.py
@@ -20,28 +20,6 @@ import random
 import json
 import string
 
-
-@provider(slots=[
-    SparseNonValueSlot(10), DenseSlot(2), SparseValueSlot(10), StringSlot(1),
-    IndexSlot(3)
-])
-def processNonSequenceData(obj, filename):
-    with open(filename, "rb") as f:
-        for line in f:
-            slots_str = line.split(';')
-            index = int(slots_str[0])
-            non_values = map(int, slots_str[1].split()[1:])
-            dense = map(float, slots_str[2].split()[1:])
-            strs = slots_str[4].strip().split(' ', 1)[1]
-
-            def __values_mapper__(s):
-                s = s.split(":")
-                return int(s[0]), float(s[1])
-
-            values = map(__values_mapper__, slots_str[3].split()[1:])
-            yield [non_values, dense, values, strs, index]
-
-
 SPARSE_ID_LIMIT = 1000
 SPARSE_ID_COUNT = 100
 SEQUENCE_LIMIT = 50
@@ -146,8 +124,6 @@ def processSubSeqAndGenerateData(obj, name):
 
 
 if __name__ == "__main__":
-    pvd = processNonSequenceData("test.txt")
-    print pvd.getNextBatch(100)
     pvd = processSeqAndGenerateData("_")
     print pvd.getNextBatch(100)
     pvd = processSubSeqAndGenerateData("_")
diff --git a/paddle/trainer/tests/test_CompareTwoOpts.cpp b/paddle/trainer/tests/test_CompareTwoOpts.cpp
deleted file mode 100644
index 383505f8131264844069d6f0fa13f4e0ac1f97af..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/test_CompareTwoOpts.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/utils/PythonUtil.h>
-#include <algorithm>
-#include <cstdlib>
-
-#include "paddle/trainer/Trainer.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(gpu_id);
-
-DECLARE_bool(local);
-DECLARE_bool(use_gpu);
-
-DECLARE_string(config);
-DECLARE_string(nics);
-
-DEFINE_string(config_file_a, "", "config of one network to compare");
-DEFINE_string(config_file_b, "", "config of another network to compare");
-DEFINE_bool(need_high_accuracy,
-            true,
-            "whether need to run in double accuracy (recommended)");
-DEFINE_double(
-    max_diff_ratio,
-    0.0f,
-    "max diff ratio allowed for outputs and parameters (value/gradient)");
-
-struct ComData {
-  vector<Argument> outArgs;
-  vector<ParameterPtr> parameters;
-};
-
-void calcGradient(ComData& data, const string configFile) {
-  FLAGS_config = configFile;
-
-  FLAGS_local = true;
-  FLAGS_use_gpu = false;
-
-  FLAGS_nics = "";
-
-  *ThreadLocalRand::getSeed() = 0;
-  srand(0);
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig(), false);
-
-  data.parameters = trainer.getGradientMachine()->getParameters();
-  trainer.getDataProvider()->setSkipShuffle();
-  trainer.train();
-}
-
-void checkBuffer(real* A,
-                 const char* desA,
-                 real* B,
-                 const char* desB,
-                 size_t len,
-                 size_t width = 1) {
-  int nNum = 0;
-  for (size_t i = 0; i < len; ++i) {
-    real diff = fabs(A[i] - B[i]);
-    if (diff > 0.0f &&
-        diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_max_diff_ratio) {
-      nNum++;
-      LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i]
-                << "    " << desB << " : " << B[i];
-    }
-  }
-  EXPECT_EQ(0, nNum);
-  LOG(INFO) << "\n\n";
-}
-
-void compareGradient(ComData& comDataA, ComData& comDataB) {
-  vector<Argument> outArgsA = comDataA.outArgs;
-  vector<Argument> outArgsB = comDataB.outArgs;
-
-  for (size_t i = 0; i < outArgsA.size(); ++i) {
-    CpuMatrix matA(outArgsA[i].value->getHeight(),
-                   outArgsA[i].value->getWidth());
-    CpuMatrix matB(outArgsB[i].value->getHeight(),
-                   outArgsB[i].value->getWidth());
-
-    matA.copyFrom(*outArgsA[i].value);
-    matB.copyFrom(*outArgsB[i].value);
-
-    LOG(INFO) << "\n--------------------------------"
-              << " Check Network Output_" << i << ":"
-              << " -------------------------------------\n";
-    checkBuffer(matA.getData(),
-                "network A output",
-                matB.getData(),
-                "network B output",
-                matA.getElementCnt(),
-                matA.getWidth());
-  }
-
-  vector<ParameterPtr>& parametersA = comDataA.parameters;
-  vector<ParameterPtr>& parametersB = comDataB.parameters;
-
-  LOG(INFO) << "\n\n--------------------------------"
-            << " Check Gradient Machine Parameters:"
-            << " -------------------------------------\n";
-  for (size_t i = 0; i < parametersA.size(); ++i) {
-    ParameterPtr parameterA, parameterB;
-    parameterA = parametersA[i];
-    parameterB = parametersB[i];
-
-    CpuVector paraA(parameterA->getSize());
-    CpuVector paraB(parameterB->getSize());
-    paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE));
-    paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_VALUE:  " << parameterA->getName()
-              << " ; size : " << paraA.getSize() << " ------------";
-    checkBuffer(paraA.getData(),
-                "Network A",
-                paraB.getData(),
-                "Network B",
-                paraA.getSize());
-
-    CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT));
-    CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName()
-              << " ; size : " << gradA.getSize() << " -----------";
-    checkBuffer(gradA.getData(),
-                "Network A",
-                gradB.getData(),
-                "Network B",
-                gradA.getSize());
-  }
-}
-
-TEST(Trainer, create) {
-  ComData dataA;
-  calcGradient(dataA, FLAGS_config_file_a);
-  LOG(INFO) << "\n\ntraining of Network A is finished\n\n";
-
-  ComData dataB;
-  calcGradient(dataB, FLAGS_config_file_b);
-  LOG(INFO) << "\n\ntraining of the Network B is finished\n\n";
-
-  compareGradient(dataA, dataB);
-}
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  initPython(argc, argv);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  if (FLAGS_need_high_accuracy) {
-    LOG(INFO) << "skip test due to it's need high accuracy";
-    return 0;
-  }
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 2e-4;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in low accuracy mode";
-  }
-#else
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 2e-7;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in high accuracy mode";
-  }
-#endif
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
diff --git a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
index 66ec65e340a435a7260028611828fb28845e0728..92dc8aa9ec5ce281d1950d84260c1b9555e686a7 100644
--- a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
+++ b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
@@ -25,45 +25,9 @@ limitations under the License. */
 #include <unordered_set>
 #include "picojson.h"
 
-void checkEqual(const paddle::Argument& expect, const paddle::Argument& actual);
 void checkValue(std::vector<paddle::Argument>& arguments, picojson::array& arr);
 const std::string kDir = "./trainer/tests/pydata_provider_wrapper_dir/";
 
-TEST(PyDataProviderWrapper, NoSequenceData) {
-  paddle::DataConfig conf;
-  conf.set_type("py");
-  conf.set_load_data_module(std::string("testPyDataWrapper"));
-  conf.set_load_data_object(std::string("processNonSequenceData"));
-  conf.set_async_load_data(false);
-  conf.clear_files();
-  conf.set_files(kDir + "test_pydata_provider_wrapper.list");
-  paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromPy;
-  provider->getNextBatch(100, &batchFromPy);
-
-  paddle::DataConfig conf2;
-  conf2.set_type("proto");
-  conf2.set_async_load_data(false);
-  conf2.clear_files();
-  conf2.set_files(kDir + "test_pydata_provider_wrapper.protolist");
-
-  provider.reset(paddle::DataProvider::create(conf2, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromProto;
-  provider->getNextBatch(100, &batchFromProto);
-
-  std::vector<paddle::Argument>& pyArguments = batchFromPy.getStreams();
-  std::vector<paddle::Argument>& protoArguments = batchFromProto.getStreams();
-  EXPECT_EQ(pyArguments.size(), protoArguments.size());
-
-  for (size_t i = 0; i < pyArguments.size(); ++i) {
-    checkEqual(protoArguments[i], pyArguments[i]);
-  }
-}
-
 TEST(PyDataProviderWrapper, SequenceData) {
   paddle::DataConfig conf;
   conf.set_type("py");
@@ -148,66 +112,6 @@ int main(int argc, char** argv) {
   return RUN_ALL_TESTS();
 }
 
-void checkEqual(const paddle::Argument& expect,
-                const paddle::Argument& actual) {
-  if (expect.value) {
-    EXPECT_TRUE(actual.value != nullptr);
-    paddle::Matrix* e = expect.value.get();
-    paddle::Matrix* a = actual.value.get();
-    EXPECT_EQ(e->getWidth(), a->getWidth());
-    EXPECT_EQ(e->getHeight(), a->getHeight());
-    if (dynamic_cast<paddle::CpuSparseMatrix*>(e)) {
-      paddle::CpuSparseMatrix* se = dynamic_cast<paddle::CpuSparseMatrix*>(e);
-      paddle::CpuSparseMatrix* sa = dynamic_cast<paddle::CpuSparseMatrix*>(a);
-      EXPECT_EQ(se->getFormat(), sa->getFormat());
-      EXPECT_EQ(se->getElementCnt(), sa->getElementCnt());
-      size_t rowSize = se->getFormat() == paddle::SPARSE_CSC
-                           ? se->getElementCnt()
-                           : se->getHeight() + 1;
-      size_t colSize = se->getFormat() == paddle::SPARSE_CSC
-                           ? se->getWidth() + 1
-                           : se->getElementCnt();
-      for (size_t i = 0; i < rowSize; ++i) {
-        EXPECT_EQ(se->getRows()[i], sa->getRows()[i]);
-      }
-      for (size_t i = 0; i < colSize; ++i) {
-        EXPECT_EQ(se->getCols()[i], sa->getCols()[i]);
-      }
-      if (se->getValueType() == paddle::FLOAT_VALUE) {
-        EXPECT_EQ(paddle::FLOAT_VALUE, sa->getValueType());
-        for (size_t i = 0; i < se->getElementCnt(); ++i) {
-          EXPECT_EQ(se->getValue()[i], sa->getValue()[i]);
-        }
-      }
-    } else if (dynamic_cast<paddle::CpuMatrix*>(e)) {
-      EXPECT_EQ(e->getElementCnt(), a->getElementCnt());
-      for (size_t i = 0; i < e->getElementCnt(); ++i) {
-        EXPECT_EQ(e->getData()[i], a->getData()[i]);
-      }
-    }
-  }
-
-  if (expect.ids) {
-    EXPECT_TRUE(actual.ids != nullptr);
-    paddle::VectorT<int>* e = expect.ids.get();
-    paddle::VectorT<int>* a = actual.ids.get();
-    EXPECT_EQ(e->getSize(), a->getSize());
-    for (size_t i = 0; i < e->getSize(); ++i) {
-      EXPECT_EQ(e->getData()[i], a->getData()[i]);
-    }
-  }
-
-  if (expect.strs) {
-    EXPECT_TRUE(actual.strs != nullptr);
-    std::vector<std::string>* e = expect.strs.get();
-    std::vector<std::string>* a = actual.strs.get();
-    EXPECT_EQ(e->size(), a->size());
-    for (size_t i = 0; i < e->size(); ++i) {
-      EXPECT_EQ((*e)[i], (*a)[i]);
-    }
-  }
-}
-
 void checkValue(std::vector<paddle::Argument>& arguments,
                 picojson::array& arr) {
   // CHECK SLOT 0, Sparse Value.
diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp
index 425b3d10a38086463784ba2a18db1293efe96e92..394038cf730f13cb957fbbc5ae0e5719b8fe9db6 100644
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
@@ -24,7 +24,6 @@ using namespace std;     // NOLINT
 static const string& configFile1 = "trainer/tests/sample_trainer_config.conf";
 static const string& configFile2 =
     "trainer/tests/sample_trainer_config_hsigmoid.conf";
-static const string& configFile3 = "trainer/tests/chunking.conf";
 static const string& configFile4 =
     "trainer/tests/sample_trainer_config_parallel.conf";
 
@@ -95,13 +94,6 @@ TEST(checkGradient, multi) {
 
 TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
 
-TEST(checkGradient, chunk) {
-  checkGradientTest(configFile3, false, false);
-#ifdef PADDLE_WITH_CUDA
-  checkGradientTest(configFile3, true, true);
-#endif
-}
-
 TEST(checkGradient, non_parallel) {
   checkGradientTest(configFile4, false, false);
 }
diff --git a/paddle/trainer/tests/test_config.conf b/paddle/trainer/tests/test_config.conf
index d1bb9b877fe26702948586dbe90b9ff0ee27c1d6..2f86aaa75316fa2a5a28edfef31c01e15a44b3d0 100644
--- a/paddle/trainer/tests/test_config.conf
+++ b/paddle/trainer/tests/test_config.conf
@@ -15,12 +15,7 @@
 
 from paddle.trainer_config_helpers import *
 
-TrainData(ProtoData(
-    files = "dummy_list",
-    constant_slots = [1.0],
-    async_load_data = True))
-
-TestData(SimpleData(
+TrainData(SimpleData(
     files = "trainer/tests/sample_filelist.txt",
     feat_dim = 3,
     context_len = 0,
diff --git a/paddle/trainer/tests/test_files.txt b/paddle/trainer/tests/test_files.txt
deleted file mode 100644
index 49002677a848c499610d5e869ce61efb2105e3c8..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/test_files.txt
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/test_proto.bin
diff --git a/paddle/trainer/tests/train.list b/paddle/trainer/tests/train.list
deleted file mode 100644
index f41e8e8893de6068deb43b08ec6a3bcdd4039326..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/train.list
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/data_bin_part
diff --git a/paddle/trainer/tests/train.txt b/paddle/trainer/tests/train.txt
deleted file mode 100644
index 2313aee987ba71ba7ea779d3cf7705478e7fbde2..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/train.txt
+++ /dev/null
@@ -1,5000 +0,0 @@
-Confidence NN B-NP
-in IN B-PP
-the DT B-NP
-pound NN I-NP
-is VBZ B-VP
-widely RB I-VP
-expected VBN I-VP
-to TO I-VP
-take VB I-VP
-another DT B-NP
-sharp JJ I-NP
-dive NN I-NP
-if IN B-SBAR
-trade NN B-NP
-figures NNS I-NP
-for IN B-PP
-September NNP B-NP
-, , O
-due JJ B-ADJP
-for IN B-PP
-release NN B-NP
-tomorrow NN B-NP
-, , O
-fail VB B-VP
-to TO I-VP
-show VB I-VP
-a DT B-NP
-substantial JJ I-NP
-improvement NN I-NP
-from IN B-PP
-July NNP B-NP
-and CC I-NP
-August NNP I-NP
-'s POS B-NP
-near-record JJ I-NP
-deficits NNS I-NP
-. . O
-
-Chancellor NNP O
-of IN B-PP
-the DT B-NP
-Exchequer NNP I-NP
-Nigel NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-restated VBN I-NP
-commitment NN I-NP
-to TO B-PP
-a DT B-NP
-firm NN I-NP
-monetary JJ I-NP
-policy NN I-NP
-has VBZ B-VP
-helped VBN I-VP
-to TO I-VP
-prevent VB I-VP
-a DT B-NP
-freefall NN I-NP
-in IN B-PP
-sterling NN B-NP
-over IN B-PP
-the DT B-NP
-past JJ I-NP
-week NN I-NP
-. . O
-
-But CC O
-analysts NNS B-NP
-reckon VBP B-VP
-underlying VBG B-NP
-support NN I-NP
-for IN B-PP
-sterling NN B-NP
-has VBZ B-VP
-been VBN I-VP
-eroded VBN I-VP
-by IN B-PP
-the DT B-NP
-chancellor NN I-NP
-'s POS B-NP
-failure NN I-NP
-to TO B-VP
-announce VB I-VP
-any DT B-NP
-new JJ I-NP
-policy NN I-NP
-measures NNS I-NP
-in IN B-PP
-his PRP$ B-NP
-Mansion NNP I-NP
-House NNP I-NP
-speech NN I-NP
-last JJ B-NP
-Thursday NNP I-NP
-. . O
-
-This DT B-NP
-has VBZ B-VP
-increased VBN I-VP
-the DT B-NP
-risk NN I-NP
-of IN B-PP
-the DT B-NP
-government NN I-NP
-being VBG B-VP
-forced VBN I-VP
-to TO I-VP
-increase VB I-VP
-base NN B-NP
-rates NNS I-NP
-to TO B-PP
-16 CD B-NP
-% NN I-NP
-from IN B-PP
-their PRP$ B-NP
-current JJ I-NP
-15 CD I-NP
-% NN I-NP
-level NN I-NP
-to TO B-VP
-defend VB I-VP
-the DT B-NP
-pound NN I-NP
-, , O
-economists NNS B-NP
-and CC O
-foreign JJ B-NP
-exchange NN I-NP
-market NN I-NP
-analysts NNS I-NP
-say VBP B-VP
-. . O
-
-`` `` O
-The DT B-NP
-risks NNS I-NP
-for IN B-PP
-sterling NN B-NP
-of IN B-PP
-a DT B-NP
-bad JJ I-NP
-trade NN I-NP
-figure NN I-NP
-are VBP B-VP
-very RB B-ADVP
-heavily RB I-ADVP
-on IN B-PP
-the DT B-NP
-down JJ I-NP
-side NN I-NP
-, , O
-'' '' O
-said VBD B-VP
-Chris NNP B-NP
-Dillow NNP I-NP
-, , O
-senior JJ B-NP
-U.K. NNP I-NP
-economist NN I-NP
-at IN B-PP
-Nomura NNP B-NP
-Research NNP I-NP
-Institute NNP I-NP
-. . O
-
-`` `` O
-If IN B-SBAR
-there EX B-NP
-is VBZ B-VP
-another DT B-NP
-bad JJ I-NP
-trade NN I-NP
-number NN I-NP
-, , O
-there EX B-NP
-could MD B-VP
-be VB I-VP
-an DT B-NP
-awful JJ I-NP
-lot NN I-NP
-of IN B-PP
-pressure NN B-NP
-, , O
-'' '' O
-noted VBD B-VP
-Simon NNP B-NP
-Briscoe NNP I-NP
-, , O
-U.K. NNP B-NP
-economist NN I-NP
-for IN B-PP
-Midland NNP B-NP
-Montagu NNP I-NP
-, , O
-a DT B-NP
-unit NN I-NP
-of IN B-PP
-Midland NNP B-NP
-Bank NNP I-NP
-PLC NNP I-NP
-. . O
-
-Forecasts NNS B-NP
-for IN B-PP
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-range VBP B-VP
-widely RB B-ADVP
-, , O
-but CC O
-few JJ B-NP
-economists NNS I-NP
-expect VBP B-VP
-the DT B-NP
-data NNS I-NP
-to TO B-VP
-show VB I-VP
-a DT B-NP
-very RB I-NP
-marked VBN I-NP
-improvement NN I-NP
-from IN B-PP
-the DT O
-# # O
-2 CD O
-billion CD O
--LRB- ( O
-$ $ B-ADJP
-3.2 CD O
-billion CD O
--RRB- ) O
-deficit NN B-NP
-in IN B-PP
-the DT B-NP
-current JJ I-NP
-account NN I-NP
-reported VBD B-VP
-for IN B-PP
-August NNP B-NP
-. . O
-
-The DT B-NP
-August NNP I-NP
-deficit NN I-NP
-and CC O
-the DT B-NP
-# # I-NP
-2.2 CD I-NP
-billion CD I-NP
-gap NN I-NP
-registered VBN B-VP
-in IN B-PP
-July NNP B-NP
-are VBP B-VP
-topped VBN I-VP
-only RB B-ADVP
-by IN B-PP
-the DT B-NP
-# # I-NP
-2.3 CD I-NP
-billion CD I-NP
-deficit NN I-NP
-of IN B-PP
-October NNP B-NP
-1988 CD I-NP
-. . O
-
-Sanjay NNP B-NP
-Joshi NNP I-NP
-, , O
-European JJ B-NP
-economist NN I-NP
-at IN B-PP
-Baring NNP B-NP
-Brothers NNPS I-NP
-& CC I-NP
-Co. NNP I-NP
-, , O
-said VBD B-VP
-there EX B-NP
-is VBZ B-VP
-no DT B-NP
-sign NN I-NP
-that IN B-SBAR
-Britain NNP B-NP
-'s POS B-NP
-manufacturing NN I-NP
-industry NN I-NP
-is VBZ B-VP
-transforming VBG I-VP
-itself PRP B-NP
-to TO B-VP
-boost VB I-VP
-exports NNS B-NP
-. . O
-
-At IN B-PP
-the DT B-NP
-same JJ I-NP
-time NN I-NP
-, , O
-he PRP B-NP
-remains VBZ B-VP
-fairly RB B-ADJP
-pessimistic JJ I-ADJP
-about IN B-PP
-the DT B-NP
-outlook NN I-NP
-for IN B-PP
-imports NNS B-NP
-, , O
-given VBN B-PP
-continued VBD B-NP
-high JJ I-NP
-consumer NN I-NP
-and CC I-NP
-capital NN I-NP
-goods NNS I-NP
-inflows NNS I-NP
-. . O
-
-He PRP B-NP
-reckons VBZ B-VP
-the DT B-NP
-current JJ I-NP
-account NN I-NP
-deficit NN I-NP
-will MD B-VP
-narrow VB I-VP
-to TO B-PP
-only RB B-NP
-# # I-NP
-1.8 CD I-NP
-billion CD I-NP
-in IN B-PP
-September NNP B-NP
-. . O
-
-However RB B-ADVP
-, , O
-Mr. NNP B-NP
-Dillow NNP I-NP
-said VBD B-VP
-he PRP B-NP
-believes VBZ B-VP
-that IN B-SBAR
-a DT B-NP
-reduction NN I-NP
-in IN B-PP
-raw JJ B-NP
-material NN I-NP
-stockbuilding VBG I-NP
-by IN B-PP
-industry NN B-NP
-could MD B-VP
-lead VB I-VP
-to TO B-PP
-a DT B-NP
-sharp JJ I-NP
-drop NN I-NP
-in IN B-PP
-imports NNS B-NP
-. . O
-
-Combined VBN B-PP
-with IN B-PP
-at IN B-ADVP
-least JJS I-ADVP
-some DT B-NP
-rebound NN I-NP
-in IN B-PP
-exports NNS B-NP
-after IN B-PP
-August NNP B-NP
-'s POS B-NP
-unexpected JJ I-NP
-decline NN I-NP
-, , O
-the DT B-NP
-deficit NN I-NP
-could MD B-VP
-narrow VB I-VP
-to TO B-PP
-as RB B-NP
-little JJ I-NP
-as IN I-NP
-# # I-NP
-1.3 CD I-NP
-billion CD I-NP
-. . O
-
-Mr. NNP B-NP
-Briscoe NNP I-NP
-, , O
-who WP B-NP
-also RB B-ADVP
-forecasts VBZ B-VP
-a DT B-NP
-# # I-NP
-1.3 CD I-NP
-billion CD I-NP
-current JJ I-NP
-account NN I-NP
-gap NN I-NP
-, , O
-warns VBZ B-VP
-that IN B-SBAR
-even RB B-SBAR
-if IN I-SBAR
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-are VBP B-VP
-bullish JJ B-ADJP
-for IN B-PP
-sterling NN B-NP
-, , O
-the DT B-NP
-currency NN I-NP
-wo MD B-VP
-n't RB I-VP
-advance VB I-VP
-much JJ B-NP
-because IN B-SBAR
-investors NNS B-NP
-will MD B-VP
-want VB I-VP
-to TO I-VP
-see VB I-VP
-further JJ B-NP
-evidence NN I-NP
-of IN B-PP
-the DT B-NP
-turnaround NN I-NP
-before IN B-PP
-adjusting VBG B-VP
-positions NNS B-NP
-. . O
-
-Nevertheless RB B-ADVP
-, , O
-he PRP B-NP
-noted VBD B-VP
-, , O
-`` `` O
-No DT B-NP
-one PRP I-NP
-will MD B-VP
-want VB I-VP
-to TO I-VP
-go VB I-VP
-into IN B-PP
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-without IN B-PP
-a DT B-NP
-flat JJ I-NP
-position NN I-NP
-'' '' O
-in IN B-PP
-the DT B-NP
-pound NN I-NP
-. . O
-
-Meanwhile RB B-ADVP
-, , O
-overall JJ B-NP
-evidence NN I-NP
-on IN B-PP
-the DT B-NP
-economy NN I-NP
-remains VBZ B-VP
-fairly RB B-ADJP
-clouded VBN I-ADJP
-. . O
-
-In IN B-PP
-his PRP$ B-NP
-Mansion NNP I-NP
-House NNP I-NP
-speech NN I-NP
-, , O
-Mr. NNP B-NP
-Lawson NNP I-NP
-warned VBD B-VP
-that IN B-SBAR
-a DT B-NP
-further JJ I-NP
-slowdown NN I-NP
-can MD B-VP
-be VB I-VP
-expected VBN I-VP
-as IN B-SBAR
-the DT B-NP
-impact NN I-NP
-of IN B-PP
-the DT B-NP
-last JJ I-NP
-rise NN I-NP
-in IN B-PP
-interest NN B-NP
-rates NNS I-NP
-earlier RBR B-NP
-this DT I-NP
-month NN I-NP
-takes VBZ B-VP
-effect NN B-NP
-. . O
-
-U.K. JJ B-NP
-base NN I-NP
-rates NNS I-NP
-are VBP B-VP
-at IN B-PP
-their PRP$ B-NP
-highest JJS I-NP
-level NN I-NP
-in IN B-PP
-eight CD B-NP
-years NNS I-NP
-. . O
-
-But CC O
-consumer NN B-NP
-expenditure NN I-NP
-data NNS I-NP
-released VBD B-VP
-Friday NNP B-NP
-do VBP B-VP
-n't RB I-VP
-suggest VB I-VP
-that IN B-SBAR
-the DT B-NP
-U.K. NNP I-NP
-economy NN I-NP
-is VBZ B-VP
-slowing VBG I-VP
-that DT B-ADVP
-quickly RB I-ADVP
-. . O
-
-The DT B-NP
-figures NNS I-NP
-show VBP B-VP
-that DT O
-spending NN B-NP
-rose VBD B-VP
-0.1 CD B-NP
-% NN I-NP
-in IN B-PP
-the DT B-NP
-third JJ I-NP
-quarter NN I-NP
-from IN B-PP
-the DT B-NP
-second JJ I-NP
-quarter NN I-NP
-and CC O
-was VBD B-VP
-up IN B-ADVP
-3.8 CD B-NP
-% NN I-NP
-from IN B-PP
-a DT B-NP
-year NN I-NP
-ago RB B-ADVP
-. . O
-
-This DT B-NP
-compares VBZ B-VP
-with IN B-PP
-a DT B-NP
-1.6 CD I-NP
-% NN I-NP
-rise NN I-NP
-in IN B-PP
-the DT B-NP
-second NN I-NP
-from IN B-PP
-the DT B-NP
-first JJ I-NP
-quarter NN I-NP
-and CC O
-a DT B-NP
-5.4 CD I-NP
-% NN I-NP
-increase NN I-NP
-from IN B-PP
-the DT B-NP
-second JJ I-NP
-quarter NN I-NP
-of IN B-PP
-1988 CD B-NP
-. . O
-
-Mr. NNP B-NP
-Dillow NNP I-NP
-said VBD B-VP
-the DT B-NP
-data NNS I-NP
-show VBP B-VP
-the DT B-NP
-economy NN I-NP
-`` `` O
-is VBZ B-VP
-still RB B-ADVP
-quite RB B-ADJP
-strong JJ I-ADJP
-, , O
-'' '' O
-but CC O
-suggestions NNS B-NP
-that IN B-SBAR
-much NN B-NP
-of IN B-PP
-the DT B-NP
-spending NN I-NP
-went VBD B-VP
-on IN B-PP
-services NNS B-NP
-rather RB B-PP
-than IN I-PP
-consumer NN B-NP
-goods NNS I-NP
-should MD B-VP
-reduce VB I-VP
-fears NNS B-NP
-of IN B-PP
-more JJR B-NP
-import NN I-NP
-rises NNS I-NP
-. . O
-
-Certainly RB B-ADVP
-, , O
-the DT B-NP
-chancellor NN I-NP
-has VBZ B-VP
-made VBN I-VP
-it PRP B-NP
-clear JJ B-ADJP
-that IN B-SBAR
-he PRP B-NP
-is VBZ B-VP
-prepared VBN I-VP
-to TO I-VP
-increase VB I-VP
-interest NN B-NP
-rates NNS I-NP
-again RB B-ADVP
-if IN B-SBAR
-necessary JJ B-ADJP
-to TO B-VP
-both DT I-VP
-ensure VB I-VP
-that IN B-SBAR
-a DT B-NP
-substantial JJ I-NP
-slowdown NN I-NP
-does VBZ B-VP
-take VB I-VP
-place NN B-NP
-and CC O
-that DT O
-sterling NN B-NP
-does VBZ B-VP
-n't RB I-VP
-decline VB I-VP
-further JJ B-ADVP
-. . O
-
-Thursday NNP B-NP
-, , O
-he PRP B-NP
-reminded VBD B-VP
-his PRP$ B-NP
-audience NN I-NP
-that IN B-SBAR
-the DT B-NP
-government NN I-NP
-`` `` O
-can MD B-VP
-not RB I-VP
-allow VB I-VP
-the DT B-NP
-necessary JJ I-NP
-rigor NN I-NP
-of IN B-PP
-monetary JJ B-NP
-policy NN I-NP
-to TO B-VP
-be VB I-VP
-undermined VBN I-VP
-by IN B-PP
-exchange NN B-NP
-rate NN I-NP
-weakness NN I-NP
-. . O
-'' '' O
-
-Analysts NNS B-NP
-agree VBP B-VP
-there EX B-NP
-is VBZ B-VP
-little JJ B-NP
-holding NN B-VP
-sterling NN B-NP
-firm NN B-ADJP
-at IN B-PP
-the DT B-NP
-moment NN I-NP
-other JJ B-ADJP
-than IN B-PP
-Mr. NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-promise NN I-NP
-that IN B-SBAR
-rates NNS B-NP
-will MD B-VP
-be VB I-VP
-pushed VBN I-VP
-higher JJR B-ADJP
-if IN B-SBAR
-necessary JJ B-ADJP
-. . O
-
-And CC O
-, , O
-they PRP B-NP
-warn VBP B-VP
-, , O
-any DT B-NP
-further JJ I-NP
-drop NN I-NP
-in IN B-PP
-the DT B-NP
-government NN I-NP
-'s POS B-NP
-popularity NN I-NP
-could MD B-VP
-swiftly RB I-VP
-make VB I-VP
-this DT B-NP
-promise NN I-NP
-sound NN B-VP
-hollow JJ B-ADJP
-. . O
-
-Sterling NNP B-NP
-was VBD B-VP
-already RB I-VP
-showing VBG I-VP
-some DT B-NP
-signs NNS I-NP
-of IN B-PP
-a DT B-NP
-lack NN I-NP
-of IN B-PP
-confidence NN B-NP
-in IN B-PP
-Mr. NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-promise NN I-NP
-Friday NNP B-NP
-. . O
-
-In IN B-PP
-European JJ B-NP
-trading NN I-NP
-it PRP B-NP
-declined VBD B-VP
-to TO B-PP
-$ $ B-NP
-1.5890 CD I-NP
-and CC O
-2.9495 CD B-NP
-marks NNS I-NP
-from IN B-PP
-$ $ B-NP
-1.5940 CD I-NP
-and CC O
-2.9429 CD B-NP
-marks NNS I-NP
-late JJ B-NP
-Thursday NNP I-NP
-. . O
-
-Economists NNS B-NP
-suggested VBD B-VP
-that IN B-SBAR
-if IN B-SBAR
-the DT B-NP
-pound NN I-NP
-falls VBZ B-VP
-much JJ B-NP
-below IN B-PP
-2.90 CD B-NP
-marks NNS I-NP
-, , O
-the DT B-NP
-government NN I-NP
-will MD B-VP
-be VB I-VP
-forced VBN I-VP
-to TO I-VP
-increase VB I-VP
-rates NNS B-NP
-to TO B-PP
-16 CD B-NP
-% NN I-NP
-, , O
-both DT B-VP
-to TO I-VP
-halt VB B-VP
-any DT B-NP
-further JJ I-NP
-decline NN I-NP
-and CC O
-ensure VB B-VP
-that IN B-SBAR
-the DT B-NP
-balance NN I-NP
-of IN B-PP
-monetary JJ B-NP
-policy NN I-NP
-remains VBZ B-VP
-unchanged JJ B-ADJP
-. . O
-
-Friday NNP B-NP
-'s POS B-NP
-Market NNP I-NP
-Activity NN I-NP
-
-The DT B-NP
-dollar NN I-NP
-posted VBD B-VP
-gains NNS B-NP
-in IN B-PP
-quiet JJ B-NP
-trading NN I-NP
-as IN B-SBAR
-concerns NNS B-NP
-about IN B-PP
-equities NNS B-NP
-abated VBN B-VP
-. . O
-
-Foreign JJ B-NP
-exchange NN I-NP
-dealers NNS I-NP
-said VBD B-VP
-that IN B-SBAR
-the DT B-NP
-currency NN I-NP
-market NN I-NP
-has VBZ B-VP
-begun VBN I-VP
-to TO I-VP
-distance VB I-VP
-itself PRP B-NP
-from IN B-PP
-the DT B-NP
-volatile JJ I-NP
-stock NN I-NP
-exchange NN I-NP
-, , O
-which WDT B-NP
-has VBZ B-VP
-preoccupied VBN I-VP
-the DT B-NP
-market NN I-NP
-since IN B-PP
-Oct. NNP B-NP
-13 CD I-NP
-, , O
-when WRB B-ADVP
-the DT B-NP
-Dow NNP I-NP
-Jones NNP I-NP
-Industrial NNP I-NP
-Average NNP I-NP
-plunged VBD B-VP
-more JJR B-NP
-than IN I-NP
-190 CD I-NP
-points NNS I-NP
-. . O
-
-Currency NN B-NP
-analysts NNS I-NP
-predict VBP B-VP
-that IN B-SBAR
-in IN B-PP
-the DT B-NP
-coming VBG I-NP
-week NN I-NP
-the DT B-NP
-foreign JJ I-NP
-exchange NN I-NP
-market NN I-NP
-will MD B-VP
-shift VB I-VP
-its PRP$ B-NP
-focus NN I-NP
-back RB B-ADVP
-to TO B-PP
-economic JJ B-NP
-fundamentals NNS I-NP
-, , O
-keeping VBG B-VP
-a DT B-NP
-close NN I-NP
-eye NN I-NP
-out IN B-ADVP
-for IN B-PP
-any DT B-NP
-signs NNS I-NP
-of IN B-PP
-monetary JJ B-NP
-easing NN I-NP
-by IN B-PP
-U.S. NNP B-NP
-Federal NNP I-NP
-Reserve NNP I-NP
-. . O
-
-Late RB B-ADVP
-in IN B-PP
-the DT B-NP
-New NNP I-NP
-York NNP I-NP
-trading NN I-NP
-day NN I-NP
-, , O
-the DT B-NP
-dollar NN I-NP
-was VBD B-VP
-quoted VBN I-VP
-at IN B-PP
-1.8578 CD B-NP
-marks NNS I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-1.8470 CD B-NP
-marks NNS I-NP
-late JJ B-NP
-Thursday NNP I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-. . O
-
-The DT B-NP
-U.S. NNP I-NP
-currency NN I-NP
-was VBD B-VP
-also RB I-VP
-changing VBG I-VP
-hands NNS B-NP
-at IN B-PP
-142.43 CD B-NP
-yen NN I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-141.70 CD B-NP
-yen NN I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-late JJ B-NP
-Thursday NNP I-NP
-. . O
-
-In IN B-PP
-Tokyo NNP B-NP
-on IN B-PP
-Monday NNP B-NP
-, , O
-the DT B-NP
-U.S. NNP I-NP
-currency NN I-NP
-opened VBD B-VP
-for IN B-PP
-trading NN B-NP
-at IN B-PP
-141.95 CD B-NP
-yen NN I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-Friday NNP B-NP
-'s POS B-NP
-Tokyo NNP I-NP
-close NN I-NP
-of IN B-PP
-141.35 CD B-NP
-yen NN I-NP
-. . O
-
-On IN B-PP
-the DT B-NP
-Commodity NNP I-NP
-Exchange NNP I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-, , O
-gold NN B-NP
-for IN B-PP
-current JJ B-NP
-delivery NN I-NP
-settled VBD B-VP
-at IN B-PP
-$ $ B-NP
-367.30 CD I-NP
-an DT B-NP
-ounce NN I-NP
-, , O
-up IN B-ADVP
-20 CD B-NP
-cents NNS I-NP
-. . O
-
-Estimated VBN B-NP
-volume NN I-NP
-was VBD B-VP
-a DT B-NP
-light NN I-NP
-2.4 CD I-NP
-million CD I-NP
-ounces NNS I-NP
-. . O
-
-In IN B-PP
-early JJ B-NP
-trading NN I-NP
-in IN B-PP
-Hong NNP B-NP
-Kong NNP I-NP
-Monday NNP B-NP
-, , O
-gold NN B-NP
-was VBD B-VP
-quoted VBN I-VP
-at IN B-PP
-$ $ B-NP
-366.50 CD I-NP
-an DT B-NP
-ounce NN I-NP
-. . O
-
-East NNP B-NP
-Rock NNP I-NP
-Partners NNP I-NP
-Limited NNP I-NP
-Partnership NNP I-NP
-said VBD B-VP
-it PRP B-NP
-proposed VBD B-VP
-to TO I-VP
-acquire VB I-VP
-A.P. NNP B-NP
-Green NNP I-NP
-Industries NNP I-NP
-Inc. NNP I-NP
-for IN B-PP
-$ $ B-NP
-40 CD I-NP
-a DT B-NP
-share NN I-NP
-. . O
-
-In IN B-PP
-an DT B-NP
-Oct. NNP I-NP
-19 CD I-NP
-letter NN I-NP
-to TO B-PP
-A.P. NNP B-NP
-Green NNP I-NP
-'s POS B-NP
-board NN I-NP
-, , O
-East NNP B-NP
-Rock NNP I-NP
-said VBD B-VP
-the DT B-NP
-offer NN I-NP
-is VBZ B-VP
-subject NN B-ADJP
-to TO B-PP
-the DT B-NP
-signing NN I-NP
-of IN B-PP
-a DT B-NP
-merger NN I-NP
-agreement NN I-NP
-by IN B-PP
-no DT B-ADVP
-later RB I-ADVP
-than IN B-PP
-Oct. NNP B-NP
-31 CD I-NP
-. . O
-
-The DT B-NP
-letter NN I-NP
-, , O
-attached VBN B-VP
-to TO B-PP
-a DT B-NP
-filing NN I-NP
-with IN B-PP
-the DT B-NP
-Securities NNP I-NP
-and CC I-NP
-Exchange NNP I-NP
-Commission NNP I-NP
-, , O
-said VBD B-VP
-the DT B-NP
-approval NN I-NP
-is VBZ B-VP
-also RB B-ADVP
-contingent JJ B-ADJP
-upon IN B-PP
-obtaining VBG B-VP
-satisfactory JJ B-NP
-financing NN I-NP
-. . O
-
-An DT B-NP
-A.P. NNP I-NP
-Green NNP I-NP
-official NN I-NP
-declined VBD B-VP
-to TO I-VP
-comment VB I-VP
-on IN B-PP
-the DT B-NP
-filing NN I-NP
-. . O
-
-The DT B-NP
-$ $ I-NP
-40-a-share JJ I-NP
-proposal NN I-NP
-values VBZ B-VP
-the DT B-NP
-company NN I-NP
-at IN B-PP
-about RB B-NP
-$ $ I-NP
-106.6 CD I-NP
-million CD I-NP
-. . O
-
-A.P. NNP B-NP
-Green NNP I-NP
-currently RB B-ADVP
-has VBZ B-VP
-2,664,098 CD B-NP
-shares NNS I-NP
-outstanding JJ B-ADJP
-. . O
-
-Its PRP$ B-NP
-stock NN I-NP
-closed VBD B-VP
-at IN B-PP
-$ $ B-NP
-38 CD I-NP
-, , O
-up IN B-ADVP
-$ $ B-NP
-1.875 CD I-NP
-, , O
-in IN B-PP
-national JJ B-NP
-over-the-counter JJ I-NP
-trading NN I-NP
-. . O
-
-The DT B-NP
-company NN I-NP
-is VBZ B-VP
-a DT B-NP
-Mexico NNP I-NP
-, , I-NP
-Mo. NNP I-NP
-, , I-NP
-maker NN I-NP
-of IN B-PP
-refractory JJ B-NP
-products NNS I-NP
-. . O
-
-East NNP B-NP
-Rock NNP I-NP
-also RB B-ADVP
-said VBD B-VP
-in IN B-PP
-the DT B-NP
-filing NN I-NP
-that IN B-SBAR
-it PRP B-NP
-boosted VBD B-VP
-its PRP$ B-NP
-stake NN I-NP
-in IN B-PP
-A.P. NNP B-NP
-Green NNP I-NP
-to TO B-PP
-8.7 CD B-NP
-% NN I-NP
-. . O
-
-It PRP B-NP
-now RB B-ADVP
-holds VBZ B-VP
-233,000 CD B-NP
-A.P. NNP I-NP
-Green NNP I-NP
-common JJ I-NP
-shares NNS I-NP
-, , O
-including VBG B-PP
-30,000 CD B-NP
-shares NNS I-NP
-bought VBD B-VP
-last JJ B-NP
-Thursday NNP I-NP
-for IN B-PP
-$ $ B-NP
-35.50 CD I-NP
-to TO I-NP
-$ $ I-NP
-36.50 CD I-NP
-a DT B-NP
-share NN I-NP
-. . O
-
-New NNP B-NP
-York-based JJ I-NP
-John NNP I-NP
-Kuhns NNP I-NP
-and CC I-NP
-Robert NNP I-NP
-MacDonald NNP I-NP
-control NN B-VP
-East NNP B-NP
-Rock NNP I-NP
-Partners NNP I-NP
-Inc. NNP I-NP
-, , O
-the DT B-NP
-sole JJ I-NP
-general JJ I-NP
-partner NN I-NP
-of IN B-PP
-East NNP B-NP
-Rock NNP I-NP
-Partners NNP I-NP
-L.P NNP I-NP
-. . O
-
-The DT B-NP
-sole JJ I-NP
-limited JJ I-NP
-partner NN I-NP
-of IN B-PP
-the DT B-NP
-partnership NN I-NP
-is VBZ B-VP
-Westwood NNP B-NP
-Brick NNP I-NP
-Lime NNP I-NP
-Inc. NNP I-NP
-, , O
-an DT B-NP
-indirect JJ I-NP
-subsidiary NN I-NP
-of IN B-PP
-Westwood NNP B-NP
-Group NNP I-NP
-Inc NNP I-NP
-. . O
-
-Both DT B-NP
-Westwood NNP B-NP
-Brick NNP I-NP
-and CC O
-Westwood NNP B-NP
-Group NNP I-NP
-are VBP B-VP
-based VBN I-VP
-in IN B-PP
-Boston NNP B-NP
-. . O
-
-Freight NN B-NP
-rates NNS I-NP
-, , O
-declining VBG B-VP
-for IN B-PP
-most RBS B-NP
-of IN B-PP
-the DT B-NP
-decade NN I-NP
-because IN B-PP
-of IN I-PP
-competition NN B-NP
-spurred VBN B-VP
-by IN B-PP
-deregulation NN B-NP
-, , O
-are VBP B-VP
-bottoming VBG I-VP
-out IN B-PRT
-, , O
-turning VBG B-VP
-upward RB B-ADVP
-and CC O
-threatening VBG B-VP
-to TO I-VP
-fuel VB I-VP
-inflation NN B-NP
-. . O
-
-Trucking NNP B-NP
-, , I-NP
-shipping VBG I-NP
-and CC I-NP
-air-freight NN I-NP
-companies NNS I-NP
-have VBP B-VP
-announced VBN I-VP
-rate NN B-NP
-increases NNS I-NP
-, , O
-scheduled VBN B-VP
-for IN B-PP
-this DT B-NP
-fall NN I-NP
-or CC O
-early JJ B-NP
-next JJ I-NP
-year NN I-NP
-, , O
-reflecting VBG B-VP
-higher JJR B-NP
-costs NNS I-NP
-and CC O
-tightened VBD B-NP
-demand NN I-NP
-for IN B-PP
-freight NN B-NP
-transport NN I-NP
-. . O
-
-Major JJ B-NP
-shippers NNS I-NP
-say VBP B-VP
-they PRP B-NP
-expect VBP B-VP
-freight NN B-NP
-rates NNS I-NP
-to TO B-VP
-rise VB I-VP
-at IN B-ADVP
-least JJS I-ADVP
-as RB B-ADVP
-fast RB I-ADVP
-as IN B-PP
-inflation NN B-NP
-and CC B-ADVP
-maybe RB I-ADVP
-faster RBR B-ADVP
-in IN B-PP
-the DT B-NP
-next JJ I-NP
-few JJ I-NP
-years NNS I-NP
-. . O
-
-That DT B-NP
-'s VBZ B-VP
-a DT B-NP
-big JJ I-NP
-change NN I-NP
-from IN B-PP
-recent JJ B-NP
-years NNS I-NP
-when WRB B-ADVP
-freight NN B-NP
-haulage NN I-NP
-was VBD B-VP
-a DT B-NP
-bright JJ I-NP
-spot NN I-NP
-for IN B-PP
-U.S. NNP B-NP
-productivity NN I-NP
-, , O
-helping VBG B-VP
-to TO I-VP
-restrain VB I-VP
-inflation NN B-NP
-and CC O
-make VB B-VP
-U.S. NNP B-NP
-industry NN I-NP
-more RBR B-ADJP
-competitive JJ I-ADJP
-abroad RB B-ADVP
-. . O
-
-`` `` O
-Demand NN B-NP
-has VBZ B-VP
-caught VBN I-VP
-up IN B-PRT
-with IN B-PP
-the DT B-NP
-supply NN I-NP
-of IN B-PP
-certain JJ B-NP
-types NNS I-NP
-of IN B-PP
-freight NN B-NP
-transportation NN I-NP
-, , O
-and CC O
-rates NNS B-NP
-are VBP B-VP
-starting VBG I-VP
-to TO I-VP
-move VB I-VP
-up IN B-ADVP
-'' '' O
-at IN B-PP
-a DT B-NP
-rate NN I-NP
-`` `` O
-close RB B-ADJP
-to TO B-PP
-or CC O
-slightly RB B-ADJP
-more JJR I-ADJP
-than IN B-PP
-the DT B-NP
-inflation NN I-NP
-rate NN I-NP
-, , O
-'' '' O
-said VBD B-VP
-Clifford NNP B-NP
-Sayre NNP I-NP
-, , O
-director NN B-NP
-of IN B-PP
-logistics NNS B-NP
-at IN B-PP
-Du NNP B-NP
-Pont NNP I-NP
-Co NNP I-NP
-. . O
-
-Shippers NNS B-NP
-surveyed VBN B-VP
-recently RB B-ADVP
-by IN B-PP
-Ohio NNP B-NP
-State NNP I-NP
-University NNP I-NP
-said VBD B-VP
-they PRP B-NP
-expect VBP B-VP
-their PRP$ B-NP
-freight-transport JJ I-NP
-, , I-NP
-storage NN I-NP
-and CC I-NP
-distribution NN I-NP
-costs NNS I-NP
-to TO B-VP
-rise VB I-VP
-about IN B-NP
-4 CD I-NP
-% NN I-NP
-this DT B-NP
-year NN I-NP
-. . O
-
-Only RB B-NP
-10 CD I-NP
-% NN I-NP
-of IN B-PP
-the DT B-NP
-250 CD I-NP
-shippers NNS I-NP
-polled VBN B-VP
-expected VBN B-VP
-their PRP$ B-NP
-freight-transport JJ I-NP
-costs NNS I-NP
-to TO B-VP
-decrease VB I-VP
-, , O
-compared VBN B-PP
-with IN B-PP
-30 CD B-NP
-% NN I-NP
-who WP B-NP
-had VBD B-VP
-looked VBN I-VP
-to TO B-PP
-freight VB B-NP
-transport NN I-NP
-to TO B-VP
-reduce VB I-VP
-costs NNS B-NP
-in IN B-PP
-past JJ B-NP
-years NNS I-NP
-. . O
-
-`` `` O
-This DT B-NP
-is VBZ B-VP
-the DT B-NP
-first JJ I-NP
-year NN I-NP
-since IN B-PP
-transportation NN B-NP
-deregulation NN I-NP
-in IN B-PP
-1980 CD B-NP
-that IN B-ADVP
-we PRP B-NP
-have VBP B-VP
-had VBN I-VP
-such JJ B-NP
-a DT I-NP
-dramatic JJ I-NP
-and CC I-NP
-broad-based JJ I-NP
-upturn NN I-NP
-in IN B-PP
-perceived VBN B-NP
-transportation NN I-NP
-rates NNS I-NP
-, , O
-'' '' O
-said VBD B-VP
-Bernard NNP B-NP
-LaLonde NNP I-NP
-, , O
-a DT B-NP
-transportation NN I-NP
-logistics NNS I-NP
-professor NN I-NP
-at IN B-PP
-Ohio NNP B-NP
-State NNP I-NP
-in IN B-PP
-Columbus NNP B-NP
-. . O
-
-The DT B-NP
-deregulation NN I-NP
-of IN B-PP
-railroads NNS B-NP
-and CC I-NP
-trucking NN I-NP
-companies NNS I-NP
-that WDT B-NP
-began VBD B-VP
-in IN B-PP
-1980 CD B-NP
-enabled VBD B-VP
-shippers NNS B-NP
-to TO B-VP
-bargain VB I-VP
-for IN B-PP
-transportation NN B-NP
-. . O
-
-Carriers NNP B-NP
-could MD B-VP
-use VB I-VP
-their PRP$ B-NP
-equipment NN I-NP
-more RBR B-ADVP
-efficiently RB I-ADVP
-, , O
-leading VBG B-VP
-to TO B-PP
-overcapacity NN B-NP
-they PRP B-NP
-were VBD B-VP
-eager JJ B-ADJP
-to TO B-VP
-fill VB I-VP
-. . O
-
-Shippers NNS B-NP
-cut VBP B-VP
-about RB B-NP
-$ $ I-NP
-35 CD I-NP
-billion CD I-NP
-from IN B-PP
-their PRP$ B-NP
-annual JJ I-NP
-, , I-NP
-inter-city JJ I-NP
-truck NN I-NP
-and CC I-NP
-rail NN I-NP
-costs NNS I-NP
-, , O
-to TO B-PP
-about RB B-NP
-$ $ I-NP
-150 CD I-NP
-billion CD I-NP
-, , O
-or CC O
-about IN B-NP
-6.4 CD I-NP
-% NN I-NP
-of IN B-PP
-gross JJ B-NP
-national JJ I-NP
-product NN I-NP
-, , O
-down RB B-ADVP
-from IN B-PP
-8 CD B-NP
-% NN I-NP
-of IN B-PP
-GNP NNP B-NP
-in IN B-PP
-1981 CD B-NP
-. . O
-
-But CC O
-with IN B-PP
-much NN B-NP
-of IN B-PP
-the DT B-NP
-inefficiency NN I-NP
-squeezed VBN B-VP
-out IN B-PP
-of IN B-PP
-the DT B-NP
-freight-transport JJ I-NP
-system NN I-NP
-, , O
-rising VBG B-NP
-costs NNS I-NP
-are VBP B-VP
-likely JJ B-ADJP
-to TO B-VP
-be VB I-VP
-reflected VBN I-VP
-directly RB B-ADVP
-in IN B-PP
-higher JJR B-NP
-freight NN I-NP
-rates NNS I-NP
-. . O
-
-`` `` O
-Shippers NNS B-NP
-are VBP B-VP
-saying VBG I-VP
-` `` O
-the DT B-NP
-party NN I-NP
-'s POS B-VP
-over IN B-ADJP
-, , O
-' '' O
-'' '' O
-said VBD B-VP
-Mr. NNP B-NP
-LaLonde NNP I-NP
-. . O
-
-`` `` O
-Shippers NNS B-NP
-wo MD B-VP
-n't RB I-VP
-be VB I-VP
-able JJ B-ADJP
-to TO B-VP
-look VB I-VP
-for IN B-PP
-transportation-cost JJ B-NP
-savings NNS I-NP
-as IN B-SBAR
-they PRP B-NP
-have VBP B-VP
-for IN B-PP
-the DT B-NP
-last JJ I-NP
-eight CD I-NP
-or CC I-NP
-nine CD I-NP
-years NNS I-NP
-. . O
-
-Transport NN B-NP
-rates NNS I-NP
-wo MD B-VP
-n't RB I-VP
-be VB I-VP
-an DT B-NP
-opportunity NN I-NP
-for IN B-PP
-offsetting VBG B-VP
-cost NN B-NP
-increases NNS I-NP
-in IN B-PP
-other JJ B-NP
-segments NNS I-NP
-of IN B-PP
-the DT B-NP
-economy NN I-NP
-. . O
-'' '' O
-
-Robert NNP B-NP
-Delaney NNP I-NP
-, , O
-a DT B-NP
-consultant NN I-NP
-at IN B-PP
-Arthur NNP B-NP
-D. NNP I-NP
-Little NNP I-NP
-Inc. NNP I-NP
-, , O
-Cambridge NNP B-NP
-, , O
-Mass. NNP B-NP
-, , O
-said VBD B-VP
-`` `` O
-We PRP B-NP
-'ve VBP B-VP
-gotten VBN I-VP
-all PDT B-NP
-the DT I-NP
-benefits NNS I-NP
-of IN B-PP
-deregulation NN B-NP
-in IN B-PP
-freight-cost JJ B-NP
-reductions NNS I-NP
-. . O
-
-Now RB B-ADVP
-we PRP B-NP
-are VBP B-VP
-starting VBG I-VP
-to TO I-VP
-see VB I-VP
-real JJ B-NP
-freight-rate JJ I-NP
-increases NNS I-NP
-as IN B-SBAR
-carriers NNS B-NP
-replace VBP B-VP
-equipment NN B-NP
-, , O
-pay VB B-VP
-higher JJR B-NP
-fuel NN I-NP
-costs NNS I-NP
-and CC O
-pay VB B-VP
-more JJR B-NP
-for IN B-PP
-labor NN B-NP
-. . O
-
-You PRP B-NP
-'ll MD B-VP
-see VB I-VP
-carriers NNS B-NP
-try VB B-VP
-to TO I-VP
-recoup VB I-VP
-some DT B-NP
-of IN B-PP
-the DT B-NP
-price NN I-NP
-cutting VBG I-NP
-that WDT B-NP
-occurred VBD B-VP
-previously RB B-ADVP
-. . O
-'' '' O
-
-Not RB B-NP
-everyone NN I-NP
-believes VBZ B-VP
-that IN B-SBAR
-the DT B-NP
-good JJ I-NP
-times NNS I-NP
-are VBP B-VP
-over IN B-ADJP
-for IN B-PP
-shippers NNS B-NP
-. . O
-
-`` `` O
-There EX B-NP
-'s VBZ B-VP
-still RB B-ADVP
-a DT B-NP
-lot NN I-NP
-of IN B-PP
-pressure NN B-NP
-on IN B-PP
-rates NNS B-NP
-in IN B-PP
-both DT B-NP
-rail NN I-NP
-and CC I-NP
-truck NN I-NP
-, , O
-'' '' O
-said VBD B-VP
-Gerard NNP B-NP
-McCullough NNP I-NP
-, , O
-lecturer NN B-NP
-in IN B-PP
-transportation NN B-NP
-at IN B-PP
-Massachusetts NNP B-NP
-Institute NNP I-NP
-of IN B-PP
-Technology NNP B-NP
-. . O
-
-Less-than-truckload JJ B-NP
-companies NNS I-NP
-, , O
-which WDT B-NP
-carry VBP B-VP
-the DT B-NP
-freight NN I-NP
-of IN B-PP
-several JJ B-NP
-shippers NNS I-NP
-in IN B-PP
-each DT B-NP
-truck NN I-NP
-trailer NN I-NP
-, , O
-discounted VBD B-VP
-away RB B-ADVP
-a DT B-NP
-4.7 CD I-NP
-% NN I-NP
-rate NN I-NP
-increase NN I-NP
-implemented VBD B-VP
-last JJ B-NP
-April NNP I-NP
-. . O
-
-The DT B-NP
-carriers NNS I-NP
-were VBD B-VP
-competing VBG I-VP
-fiercely RB B-ADVP
-for IN B-PP
-market NN B-NP
-share NN I-NP
-. . O
-
-Railroad-rate JJ B-NP
-increases NNS I-NP
-are VBP B-VP
-likely JJ B-ADJP
-to TO B-VP
-be VB I-VP
-restrained VBN I-VP
-by IN B-PP
-weakening VBG B-NP
-rail-traffic JJ I-NP
-levels NNS I-NP
-and CC O
-keen JJ B-NP
-competition NN I-NP
-for IN B-PP
-freight NN B-NP
-from IN B-PP
-trucks NNS B-NP
-. . O
-
-An DT B-NP
-official NN I-NP
-at IN B-PP
-Consolidated NNP B-NP
-Freightways NNP I-NP
-Inc. NNP I-NP
-, , O
-a DT B-NP
-Menlo NNP I-NP
-Park NNP I-NP
-, , I-NP
-Calif. NNP I-NP
-, , I-NP
-less-than-truckload JJ I-NP
-carrier NN I-NP
-, , O
-said VBD B-VP
-rate NN B-NP
-discounting NN I-NP
-in IN B-PP
-that DT B-NP
-industry NN I-NP
-has VBZ B-VP
-begun VBN I-VP
-to TO I-VP
-`` `` O
-stabilize VB B-VP
-. . O
-'' '' O
-
-Consolidated NNP B-NP
-Freightways NNP I-NP
-plans VBZ B-VP
-to TO I-VP
-raise VB I-VP
-its PRP$ B-NP
-rates NNS I-NP
-5.3 CD B-NP
-% NN I-NP
-late JJ B-NP
-this DT I-NP
-year NN I-NP
-or CC O
-early JJ B-NP
-next JJ I-NP
-year NN I-NP
-, , O
-and CC O
-at IN B-NP
-least JJS I-NP
-two CD I-NP
-competitors NNS I-NP
-have VBP B-VP
-announced VBN I-VP
-similar JJ B-NP
-increases NNS I-NP
-. . O
-
-Truckers NNS B-NP
-are VBP B-VP
-`` `` O
-trying VBG B-VP
-to TO I-VP
-send VB I-VP
-signals NNS B-NP
-that IN B-SBAR
-they PRP B-NP
-need VBP B-VP
-to TO I-VP
-stop VB I-VP
-the DT B-NP
-bloodletting NN I-NP
-, , O
-forget VB B-VP
-about IN B-PP
-market NN B-NP
-share NN I-NP
-and CC O
-go VB B-VP
-for IN B-PP
-higher JJR B-NP
-rates NNS I-NP
-, , O
-'' '' O
-said VBD B-VP
-Michael NNP B-NP
-Lloyd NNP I-NP
-, , O
-an DT B-NP
-analyst NN I-NP
-at IN B-PP
-Salomon NNP B-NP
-Bros NNP I-NP
-. . O
-
-And CC O
-`` `` O
-shippers NNS B-NP
-are VBP B-VP
-getting VBG I-VP
-the DT B-NP
-feeling NN I-NP
-that IN B-SBAR
-they PRP B-NP
-have VBP B-VP
-played VBN I-VP
-one CD B-NP
-trucker NN I-NP
-off IN B-ADVP
-against IN B-PP
-another DT B-NP
-as RB B-NP
-much JJ I-NP
-as IN B-SBAR
-they PRP B-NP
-can MD B-VP
-, , O
-'' '' O
-he PRP B-NP
-said VBD B-VP
-. . O
-
-Air-freight NN B-NP
-carriers NNS I-NP
-raised VBD B-VP
-their PRP$ B-NP
-rates NNS I-NP
-for IN B-PP
-U.S. NNP B-NP
-products NNS I-NP
-going VBG B-VP
-across IN B-PP
-the DT B-NP
-Pacific NNP I-NP
-to TO B-PP
-Asia NNP B-NP
-by IN B-PP
-about IN B-NP
-20 CD I-NP
-% NN I-NP
-earlier RBR B-NP
-this DT I-NP
-month NN I-NP
-. . O
-
-And CC O
-Japan NNP B-NP
-Air NNP I-NP
-Lines NNPS I-NP
-said VBD B-VP
-it PRP B-NP
-plans VBZ B-VP
-to TO I-VP
-boost VB I-VP
-its PRP$ B-NP
-rates NNS I-NP
-a DT B-NP
-further JJ I-NP
-25 CD I-NP
-% NN I-NP
-over IN B-PP
-the DT B-NP
-next JJ I-NP
-two CD I-NP
-years NNS I-NP
-. . O
-
-Such JJ B-NP
-rate NN I-NP
-increases NNS I-NP
-`` `` O
-will MD B-VP
-increase VB I-VP
-the DT B-NP
-total JJ I-NP
-cost NN I-NP
-of IN B-PP
-U.S. NNP B-NP
-products NNS I-NP
-and CC O
-slow JJ B-VP
-down RP B-PRT
-the DT B-NP
-rate NN I-NP
-of IN B-PP
-increase NN B-NP
-of IN B-PP
-U.S. NNP B-NP
-exports NNS I-NP
-, , O
-'' '' O
-said VBD B-VP
-Richard NNP B-NP
-Connors NNP I-NP
-, , O
-a DT B-NP
-senior JJ I-NP
-vice NN I-NP
-president NN I-NP
-of IN B-PP
-Yusen NNP B-NP
-Air NNP I-NP
-& CC I-NP
-Sea NNP I-NP
-Service NNP I-NP
-U.S.A. NNP I-NP
-Inc. NNP I-NP
-, , O
-the DT B-NP
-U.S. NNP I-NP
-air-freight-forwarding JJ I-NP
-subsidiary NN I-NP
-of IN B-PP
-Nippon NNP B-NP
-Yusen NNP I-NP
-Kaisha NNP I-NP
-of IN B-PP
-Japan NNP B-NP
-. . O
-
-Ship NN B-NP
-companies NNS I-NP
-carrying VBG B-VP
-bulk NN B-NP
-commodities NNS I-NP
-, , O
-such JJ B-PP
-as IN I-PP
-oil NN B-NP
-, , O
-grain NN B-NP
-, , O
-coal NN B-NP
-and CC O
-iron NN B-NP
-ore NN I-NP
-, , O
-have VBP B-VP
-been VBN I-VP
-able JJ B-ADJP
-to TO B-VP
-increase VB I-VP
-their PRP$ B-NP
-rates NNS I-NP
-in IN B-PP
-the DT B-NP
-last JJ I-NP
-couple NN I-NP
-of IN B-PP
-years NNS B-NP
-. . O
-
-Some DT B-NP
-bulk NN I-NP
-shipping VBG I-NP
-rates NNS I-NP
-have VBP B-VP
-increased VBN I-VP
-`` `` O
-3 CD B-NP
-% NN I-NP
-to TO I-NP
-4 CD I-NP
-% NN I-NP
-in IN B-PP
-the DT B-NP
-past JJ I-NP
-few JJ I-NP
-months NNS I-NP
-, , O
-'' '' O
-said VBD B-VP
-Salomon NNP B-NP
-'s POS B-NP
-Mr. NNP I-NP
-Lloyd NNP I-NP
-. . O
-
-And CC O
-ship NN B-NP
-lines NNS I-NP
-carrying VBG B-VP
-containers NNS B-NP
-are VBP B-VP
-also RB I-VP
-trying VBG I-VP
-to TO I-VP
-raise VB I-VP
-their PRP$ B-NP
-rates NNS I-NP
-. . O
-
-Carriers NNP B-NP
-boosted VBD B-VP
-rates NNS B-NP
-more JJR B-NP
-than IN I-NP
-10 CD I-NP
-% NN I-NP
-in IN B-PP
-the DT B-NP
-North NNP I-NP
-Atlantic NNP I-NP
-between IN B-PP
-the DT B-NP
-U.S. NNP I-NP
-and CC O
-Europe NNP B-NP
-last JJ B-NP
-September NNP I-NP
-, , O
-hoping VBG B-VP
-to TO I-VP
-partly RB I-VP
-restore VB I-VP
-rates NNS B-NP
-to TO B-PP
-earlier JJR B-NP
-levels NNS I-NP
-. . O
-
-Ship NN B-NP
-lines NNS I-NP
-operating VBG B-VP
-in IN B-PP
-the DT B-NP
-Pacific NNP I-NP
-plan NN B-VP
-to TO I-VP
-raise VB I-VP
-rates NNS B-NP
-on IN B-PP
-containers NNS B-NP
-carrying VBG B-VP
-U.S. NNP B-NP
-exports NNS I-NP
-to TO B-PP
-Asia NNP B-NP
-about IN B-NP
-10 CD I-NP
-% NN I-NP
-, , O
-effective JJ B-ADJP
-next JJ B-NP
-April NNP I-NP
-. . O
-
-MGM NNP B-NP
-Grand NNP I-NP
-Inc. NNP I-NP
-said VBD B-VP
-it PRP B-NP
-filed VBD B-VP
-a DT B-NP
-registration NN I-NP
-statement NN I-NP
-with IN B-PP
-the DT B-NP
-Securities NNP I-NP
-and CC I-NP
-Exchange NNP I-NP
-Commission NNP I-NP
-for IN B-PP
-a DT B-NP
-public JJ I-NP
-offering NN I-NP
-of IN B-PP
-six CD B-NP
-million CD I-NP
-common JJ I-NP
-shares NNS I-NP
-. . O
-
-The DT B-NP
-Beverly NNP I-NP
-Hills NNP I-NP
-, , I-NP
-Calif.-based JJ I-NP
-company NN I-NP
-said VBD B-VP
-it PRP B-NP
-would MD B-VP
-have VB I-VP
-26.9 CD B-NP
-million CD I-NP
-common JJ I-NP
-shares NNS I-NP
-outstanding JJ B-ADJP
-after IN B-PP
-the DT B-NP
-offering NN I-NP
-. . O
-
-The DT B-NP
-hotel NN I-NP
-and CC I-NP
-Gaming NNP I-NP
-company NN I-NP
-said VBD B-VP
-Merrill NNP B-NP
-Lynch NNP I-NP
-Capital NNP I-NP
-Markets NNPS I-NP
-will MD B-VP
-lead VB I-VP
-the DT B-NP
-underwriters NNS I-NP
-. . O
-
-Proceeds NNS B-NP
-from IN B-PP
-the DT B-NP
-sale NN I-NP
-will MD B-VP
-be VB I-VP
-used VBN I-VP
-for IN B-PP
-remodeling VBG B-NP
-and CC I-NP
-refurbishing VBG I-NP
-projects NNS I-NP
-, , B-PP
-as RB I-PP
-well RB I-PP
-as IN I-PP
-for IN B-PP
-the DT B-NP
-planned VBN I-NP
-MGM NNP I-NP
-Grand NNP I-NP
-hotel\/casino NN I-NP
-and CC I-NP
-theme NN I-NP
-park NN I-NP
-. . O
-
-Bob NNP B-NP
-Stone NNP I-NP
-stewed JJ B-VP
-over IN B-PP
-a DT B-NP
-letter NN I-NP
-from IN B-PP
-his PRP$ B-NP
-manager NN I-NP
-putting VBG B-VP
-him PRP B-NP
-on IN B-PP
-probation NN B-NP
-for IN B-PP
-insubordination NN B-NP
-. . O
-
-Mr. NNP B-NP
-Stone NNP I-NP
-thought VBD B-VP
-the DT B-NP
-discipline NN I-NP
-was VBD B-VP
-unfair JJ B-ADJP
-; : O
-he PRP B-NP
-believed VBD B-VP
-that IN B-SBAR
-his PRP$ B-NP
-manager NN I-NP
-wanted VBD B-VP
-to TO I-VP
-get VB I-VP
-rid JJ B-ADJP
-of IN B-PP
-him PRP B-NP
-for IN B-PP
-personal JJ B-NP
-reasons NNS I-NP
-. . O
-
-Unable JJ B-ADJP
-to TO B-VP
-persuade VB I-VP
-the DT B-NP
-manager NN I-NP
-to TO B-VP
-change VB I-VP
-his PRP$ B-NP
-decision NN I-NP
-, , O
-he PRP B-NP
-went VBD B-VP
-to TO B-PP
-a DT B-NP
-`` `` I-NP
-company NN I-NP
-court NN I-NP
-'' '' O
-for IN B-PP
-a DT B-NP
-hearing NN I-NP
-. . O
-
-At IN B-PP
-the DT B-NP
-scheduled VBN I-NP
-time NN I-NP
-, , O
-Mr. NNP B-NP
-Stone NNP I-NP
-entered VBD B-VP
-a DT B-NP
-conference NN I-NP
-room NN I-NP
-in IN B-PP
-a DT B-NP
-building NN I-NP
-near IN B-PP
-where WRB B-ADVP
-he PRP B-NP
-worked VBD B-VP
-. . O
-
-After IN B-SBAR
-the DT B-NP
-three CD I-NP
-members NNS I-NP
-of IN B-PP
-the DT B-NP
-court NN I-NP
-introduced VBD B-VP
-themselves PRP B-NP
-, , O
-the DT B-NP
-chairman NN I-NP
-of IN B-PP
-the DT B-NP
-panel NN I-NP
-said VBD B-VP
-: : O
-`` `` O
-Go VB B-VP
-ahead RB B-ADVP
-and CC O
-tell VB B-VP
-us PRP B-NP
-what WP B-NP
-happened VBD B-VP
-. . O
-
-We PRP B-NP
-may MD B-VP
-ask VB I-VP
-questions NNS B-NP
-as IN B-SBAR
-you PRP B-NP
-go VBP B-VP
-along IN B-PRT
-, , O
-or CC O
-we PRP B-NP
-may MD B-VP
-wait VB I-VP
-until IN B-PP
-the DT B-NP
-end NN I-NP
-. . O
-'' '' O
-
-No DT B-NP
-lawyers NNS I-NP
-or CC I-NP
-tape NN I-NP
-recorders NNS I-NP
-were VBD B-VP
-present JJ B-ADJP
-. . O
-
-The DT B-NP
-only RB I-NP
-extra JJ I-NP
-people NNS I-NP
-were VBD B-VP
-a DT B-NP
-couple NN I-NP
-of IN B-PP
-personnel NNS B-NP
-specialists NNS I-NP
-, , O
-one CD B-NP
-of IN B-PP
-whom WP B-NP
-knew VBD B-VP
-Mr. NNP B-NP
-Stone NNP I-NP
-'s POS B-NP
-case NN I-NP
-intimately RB B-ADVP
-and CC O
-would MD B-VP
-help VB I-VP
-fill VB I-VP
-in IN B-PRT
-any DT B-NP
-facts NNS I-NP
-needed VBN B-VP
-to TO B-VP
-give VB I-VP
-the DT B-NP
-court NN I-NP
-the DT B-NP
-full JJ I-NP
-picture NN I-NP
-. . O
-
-Over IN B-PP
-a DT B-NP
-cup NN I-NP
-of IN B-PP
-coffee NN B-NP
-, , O
-Mr. NNP B-NP
-Stone NNP I-NP
-told VBD B-VP
-his PRP$ B-NP
-story NN I-NP
-. . O
-
-He PRP B-NP
-talked VBD B-VP
-about IN B-NP
-20 CD I-NP
-minutes NNS I-NP
-. . O
-
-When WRB B-ADVP
-he PRP B-NP
-was VBD B-VP
-through IN B-ADJP
-, , O
-the DT B-NP
-court NN I-NP
-members NNS I-NP
-asked VBD B-VP
-many JJ B-NP
-questions NNS I-NP
-, , O
-then RB B-ADVP
-the DT B-NP
-chairman NN I-NP
-said VBD B-VP
-they PRP B-NP
-would MD B-VP
-like VB I-VP
-to TO I-VP
-hear VB I-VP
-his PRP$ B-NP
-manager NN I-NP
-'s POS B-NP
-side NN I-NP
-and CC O
-talk VB B-VP
-to TO B-PP
-witnesses NNS B-NP
-. . O
-
-The DT B-NP
-chairman NN I-NP
-promised VBD B-VP
-Mr. NNP B-NP
-Stone NNP I-NP
-a DT B-NP
-decision NN I-NP
-within IN B-PP
-two CD B-NP
-weeks NNS I-NP
-. . O
-
-Bob NNP B-NP
-Stone NNP I-NP
-is VBZ B-VP
-a DT B-NP
-fictional JJ I-NP
-name NN I-NP
-, , O
-but CC O
-the DT B-NP
-incident NN I-NP
-described VBN B-VP
-is VBZ B-VP
-real JJ B-ADJP
-. . O
-
-It PRP B-NP
-happened VBD B-VP
-at IN B-PP
-Northrop NNP B-NP
-Corp. NNP I-NP
-in IN B-PP
-Los NNP B-NP
-Angeles NNP I-NP
-. . O
-
-The DT B-NP
-court NN I-NP
-is VBZ B-VP
-called VBN I-VP
-the DT B-NP
-Management NNP I-NP
-Appeals NNP I-NP
-Committee NNP I-NP
-, , O
-or CC O
-just RB B-NP
-`` `` I-NP
-MAC NNP I-NP
-, , O
-'' '' O
-and CC O
-it PRP B-NP
-is VBZ B-VP
-likely JJ B-ADJP
-to TO B-VP
-hear VB I-VP
-a DT B-NP
-couple NN I-NP
-of IN I-NP
-dozen NN I-NP
-cases VBZ I-NP
-a DT B-NP
-year NN I-NP
-. . O
-
-Alter VB B-VP
-some DT B-NP
-details NNS I-NP
-of IN B-PP
-this DT B-NP
-example NN I-NP
-and CC O
-it PRP B-NP
-could MD B-VP
-be VB I-VP
-taking VBG I-VP
-place NN B-NP
-today NN B-ADVP
-at IN B-PP
-Federal NNP B-NP
-Express NNP I-NP
-in IN B-PP
-Memphis NNP B-NP
-, , O
-the DT B-NP
-Defense NNP I-NP
-and CC I-NP
-Underseas NNP I-NP
-Systems NNP I-NP
-divisions NNS I-NP
-of IN B-PP
-Honeywell NNP B-NP
-in IN B-PP
-Minneapolis NNP B-NP
-, , O
-a DT B-NP
-General NNP I-NP
-Electric NNP I-NP
-plant NN I-NP
-in IN B-PP
-Columbia NNP B-NP
-, , O
-Md. NNP B-NP
-, , O
-or CC O
-a DT B-NP
-number NN I-NP
-of IN B-PP
-other JJ B-NP
-companies NNS I-NP
-. . O
-
-These DT B-NP
-firms NNS I-NP
-are VBP B-VP
-pioneers NNS B-NP
-in IN B-PP
-a DT B-NP
-significant JJ I-NP
-new JJ I-NP
-trend NN I-NP
-in IN B-PP
-the DT B-NP
-corporate JJ I-NP
-world NN I-NP
-: : O
-the DT B-NP
-rise NN I-NP
-of IN B-PP
-what WP B-NP
-I PRP B-NP
-call VBP B-VP
-corporate JJ B-NP
-due JJ I-NP
-process NN I-NP
-. . O
-
-Although IN B-SBAR
-corporate JJ B-NP
-due JJ I-NP
-process NN I-NP
-is VBZ B-VP
-practiced VBN I-VP
-today NN B-NP
-in IN B-PP
-few JJ B-NP
-companies NNS I-NP
--- : O
-perhaps RB B-ADVP
-40 CD B-NP
-to TO I-NP
-60 CD I-NP
--- : O
-it PRP B-NP
-is VBZ B-VP
-one CD B-NP
-of IN B-PP
-the DT B-NP
-fastest JJS I-NP
-developing VBG I-NP
-trends NNS I-NP
-in IN B-PP
-industry NN B-NP
-. . O
-
-In IN B-PP
-the DT B-NP
-coming VBG I-NP
-decade NN I-NP
-a DT B-NP
-majority NN I-NP
-of IN B-PP
-people-oriented JJ B-NP
-companies NNS I-NP
-are VBP B-VP
-likely JJ B-ADJP
-to TO B-VP
-adopt VB I-VP
-it PRP B-NP
-. . O
-
-Corporate JJ B-NP
-due JJ I-NP
-process NN I-NP
-appeals NNS B-VP
-to TO B-PP
-management NN B-NP
-for IN B-PP
-a DT B-NP
-variety NN I-NP
-of IN B-PP
-reasons NNS B-NP
-. . O
-
-It PRP B-NP
-reduces VBZ B-VP
-lawsuits NNS B-NP
-from IN B-PP
-disgruntled JJ B-NP
-employees NNS I-NP
-and CC I-NP
-ex-employees NNS I-NP
-, , O
-with IN B-PP
-all DT B-NP
-that WDT B-NP
-means VBZ B-VP
-for IN B-PP
-reduced VBN B-NP
-legal JJ I-NP
-costs NNS I-NP
-and CC O
-better RBR B-NP
-public JJ I-NP
-relations NNS I-NP
-. . O
-
-It PRP B-NP
-helps VBZ B-VP
-to TO I-VP
-keep VB I-VP
-out IN B-PRT
-unions NNS B-NP
-. . O
-
-It PRP B-NP
-increases VBZ B-VP
-employee NN B-NP
-commitment NN I-NP
-to TO B-PP
-the DT B-NP
-company NN I-NP
-, , O
-with IN B-PP
-all DT B-NP
-that WDT B-NP
-means VBZ B-VP
-for IN B-PP
-efficiency NN B-NP
-and CC O
-quality NN B-NP
-control NN I-NP
-. . O
-
-What WP B-NP
-must MD O
-your PRP$ B-NP
-management NN I-NP
-team NN I-NP
-do VBP B-VP
-to TO B-VP
-establish VB I-VP
-corporate JJ B-NP
-due JJ I-NP
-process NN I-NP
-? . O
-
-Here RB B-ADVP
-are VBP B-VP
-four CD B-NP
-key JJ I-NP
-steps NNS I-NP
-: : O
-
-1 CD B-LST
-. . O
-Make VB B-VP
-sure JJ B-ADJP
-you PRP B-NP
-have VBP B-VP
-a DT B-NP
-strong JJ I-NP
-personnel NNS I-NP
-department NN I-NP
-. . O
-
-It PRP B-NP
-must MD B-VP
-be VB I-VP
-able JJ B-ADJP
-to TO B-VP
-handle VB I-VP
-most RBS B-NP
-of IN B-PP
-the DT B-NP
-complaints NNS I-NP
-that WDT B-NP
-can MD B-VP
-not RB I-VP
-be VB I-VP
-solved VBN I-VP
-in IN B-PP
-the DT B-NP
-trenches NNS I-NP
-by IN B-PP
-managers NNS B-NP
-and CC O
-their PRP$ B-NP
-subordinates NNS I-NP
-, , O
-else RB B-ADVP
-the DT B-NP
-company NN I-NP
-court NN I-NP
-or CC I-NP
-adjudicators NNS I-NP
-will MD B-VP
-be VB B-VP
-inundated VBN I-VP
-with IN B-PP
-cases NNS B-NP
-. . O
-
-At IN B-PP
-Polaroid NNP B-NP
-, , O
-the DT B-NP
-Personnel NNP I-NP
-Policy NNP I-NP
-Planning NNP I-NP
-Committee NNP I-NP
-may MD B-VP
-hear VB I-VP
-only RB B-NP
-about IN I-NP
-20 CD I-NP
-cases VBZ I-NP
-a DT B-NP
-year NN I-NP
-; : O
-the DT B-NP
-rest NN I-NP
-of IN B-PP
-the DT B-NP
-many JJ I-NP
-hundreds NNS I-NP
-of IN B-PP
-complaints NNS B-NP
-are VBP B-VP
-resolved VBN I-VP
-at IN B-PP
-earlier JJR B-NP
-stages NNS I-NP
-. . O
-
-At IN B-PP
-TWA NNP B-NP
-, , O
-the DT B-NP
-System NNP I-NP
-Board NNP I-NP
-of IN B-PP
-Adjustment NNP B-NP
-hears VBZ B-VP
-50 CD B-NP
-to TO I-NP
-75 CD I-NP
-cases VBZ I-NP
-a DT B-NP
-year NN I-NP
-, , O
-only RB B-NP
-a DT I-NP
-fraction NN I-NP
-of IN B-PP
-the DT B-NP
-complaints NNS I-NP
-brought VBN B-VP
-to TO B-PP
-personnel NNS B-NP
-specialists NNS I-NP
-. . O
-
-At IN B-PP
-Citicorp NNP B-NP
-, , O
-the DT B-NP
-Problem NNP I-NP
-Review NNP I-NP
-Board NNP I-NP
-may MD B-VP
-hear VB I-VP
-only RB B-NP
-12 CD I-NP
-or CC I-NP
-so RB I-NP
-cases VBZ I-NP
-because IN B-PP
-of IN I-PP
-personnel NNS B-NP
-'s POS B-NP
-skill NN I-NP
-in IN B-PP
-complaint-resolution NN B-NP
-. . O
-
-In IN B-PP
-a DT B-NP
-typical JJ I-NP
-year NN I-NP
-, , O
-up IN B-NP
-to TO I-NP
-20 CD I-NP
-% NN I-NP
-of IN B-PP
-the DT B-NP
-work NN I-NP
-force NN I-NP
-goes VBZ B-VP
-to TO B-PP
-personnel NNS B-NP
-specialists NNS I-NP
-with IN B-PP
-complaints NNS B-NP
-of IN B-PP
-unfair JJ B-NP
-treatment NN I-NP
-. . O
-
-In IN B-PP
-a DT B-NP
-large JJ I-NP
-company NN I-NP
-that WDT B-NP
-means VBZ B-VP
-many JJ B-NP
-hundreds NNS I-NP
-of IN B-PP
-complaints NNS B-NP
-for IN B-PP
-personnel NNS B-NP
-to TO B-VP
-handle VB I-VP
-. . O
-
-2 CD B-LST
-. . O
-Formally RB B-ADVP
-or CC I-ADVP
-informally RB I-ADVP
-, , O
-train NN B-VP
-all DT B-NP
-your PRP$ I-NP
-managers NNS I-NP
-and CC I-NP
-supervisors NNS I-NP
-in IN B-PP
-the DT B-NP
-company NN I-NP
-'s POS B-NP
-due-process NN I-NP
-approach NN I-NP
-. . O
-
-See VB B-VP
-that IN B-SBAR
-they PRP B-NP
-know VBP B-VP
-company NN B-NP
-personnel NNS I-NP
-policy NN I-NP
-backwards RB B-ADVP
-and CC I-ADVP
-forwards RB I-ADVP
-, , O
-for IN O
-it PRP B-NP
-is VBZ B-VP
-the DT B-NP
-`` `` I-NP
-law NN I-NP
-'' '' O
-governing VBG B-VP
-company NN B-NP
-courts NNS I-NP
-and CC I-NP
-adjudicators NNS I-NP
-. . O
-
-Coach NNP B-VP
-them PRP B-NP
-in IN B-PP
-handling NN B-VP
-complaints NNS B-NP
-so RB B-SBAR
-that IN I-SBAR
-they PRP B-NP
-can MD B-VP
-resolve VB I-VP
-problems NNS B-NP
-immediately RB B-ADVP
-. . O
-
-In IN B-SBAR
-case NN O
-managers NNS B-NP
-and CC O
-personnel NNS B-NP
-specialists NNS I-NP
-are VBP B-VP
-unsuccessful JJ B-ADJP
-and CC O
-subordinates NNS B-NP
-take VBP B-VP
-their PRP$ B-NP
-complaints NNS I-NP
-to TO B-PP
-a DT B-NP
-company NN I-NP
-court NN I-NP
-or CC I-NP
-adjudicator NN I-NP
-, , O
-teach VB B-VP
-managers NNS B-NP
-to TO B-VP
-accept VB I-VP
-reversals NNS B-NP
-as IN B-PP
-a DT B-NP
-fact NN I-NP
-of IN B-PP
-business NN B-NP
-life NN I-NP
-, , O
-for IN O
-in IN B-PP
-a DT B-NP
-good JJ I-NP
-due-process NN I-NP
-system NN I-NP
-they PRP B-NP
-are VBP B-VP
-bound VBN I-VP
-to TO I-VP
-happen VB I-VP
-. . O
-
-In IN B-PP
-the DT B-NP
-15 CD I-NP
-companies NNS I-NP
-I PRP B-NP
-studied VBD B-VP
-, , O
-reversal NN B-NP
-rates NNS I-NP
-range VBP B-VP
-on IN B-PP
-the DT B-NP
-average NN I-NP
-from IN B-PP
-20 CD B-NP
-% NN I-NP
-to TO B-PP
-40 CD B-NP
-% NN I-NP
-. . O
-
-3 CD B-LST
-. . O
-Decide VB B-VP
-whether IN O
-you PRP B-NP
-want VBP B-VP
-a DT B-NP
-panel NN I-NP
-system NN I-NP
-or CC O
-a DT B-NP
-single JJ I-NP
-adjudicator NN I-NP
-. . O
-
-A DT B-NP
-panel NN I-NP
-system NN I-NP
-like IN B-PP
-that DT B-NP
-in NN B-PP
-the DT B-NP
-Bob NNP I-NP
-Stone NNP I-NP
-example NN I-NP
-enjoys VBZ B-VP
-such JJ B-NP
-advantages NNS I-NP
-as IN B-PP
-high JJ B-NP
-credibility NN I-NP
-and CC O
-, , O
-for IN B-PP
-the DT B-NP
-panelists NNS I-NP
-, , O
-mutual JJ B-NP
-support NN I-NP
-. . O
-
-An DT B-NP
-adjudicator NN I-NP
-system NN I-NP
--- : O
-that DT B-INTJ
-is VBZ I-INTJ
-, , O
-an DT B-NP
-investigator NN I-NP
-who WP B-NP
-acts VBZ B-VP
-first JJ B-ADVP
-as IN B-PP
-a DT B-NP
-fact-finder NN I-NP
-and CC O
-then RB O
-switches VBZ B-VP
-hats NNS B-NP
-and CC O
-arbitrates VBZ B-VP
-the DT B-NP
-facts NNS I-NP
--- : O
-has VBZ B-VP
-such JJ B-NP
-advantages NNS I-NP
-as IN B-PP
-speed NN B-NP
-, , O
-flexibility NN B-NP
-and CC O
-maximum JJ B-NP
-privacy NN I-NP
-. . O
-
-International NNP B-NP
-Business NNP I-NP
-Machines NNPS I-NP
-and CC O
-Bank NNP B-NP
-of IN B-PP
-America NNP B-NP
-are VBP B-VP
-among IN B-PP
-the DT B-NP
-companies NNS I-NP
-using VBG B-VP
-the DT B-NP
-single-adjudicator JJ I-NP
-approach NN I-NP
-. . O
-
-4 CD B-LST
-. . O
-Make VB B-VP
-your PRP$ B-NP
-due-process NN I-NP
-system NN I-NP
-visible JJ B-ADJP
-. . O
-
-It PRP B-NP
-wo MD B-VP
-n't RB I-VP
-do VB I-VP
-any DT B-NP
-good NN I-NP
-for IN B-PP
-anybody NN B-NP
-unless IN B-SBAR
-employees NNS B-NP
-know VBP B-VP
-about IN B-PP
-it PRP B-NP
-. . O
-
-Most JJS B-NP
-managements NNS I-NP
-hesitate VBP B-VP
-to TO I-VP
-go VB I-VP
-all DT B-ADVP
-out NN I-ADVP
-in IN B-PP
-advertising VBG B-VP
-their PRP$ B-NP
-due-process NN I-NP
-systems NNS I-NP
-for IN B-PP
-fear NN B-NP
-of IN B-PP
-encouraging VBG B-VP
-cranks NNS B-NP
-and CC O
-chronic JJ B-NP
-soreheads NNS I-NP
-to TO B-VP
-file VB I-VP
-complaints NNS B-NP
-. . O
-
-On IN B-PP
-the DT B-NP
-other JJ I-NP
-hand NN I-NP
-, , O
-they PRP B-NP
-make VBP B-VP
-sure JJ B-ADJP
-at IN B-PP
-a DT B-NP
-minimum NN I-NP
-that IN B-SBAR
-their PRP$ B-NP
-systems NNS I-NP
-are VBP B-VP
-described VBN I-VP
-in IN B-PP
-their PRP$ B-NP
-employee NN I-NP
-handbooks NNS I-NP
-and CC O
-talked VBD B-VP
-up IN B-PRT
-by IN B-PP
-personnel NNS B-NP
-specialists NNS I-NP
-. . O
-
-Smith-Kline NNP B-NP
-Beecham NNP I-NP
-goes VBZ B-VP
-further JJ B-ADVP
-and CC O
-sometimes RB B-VP
-features VBZ I-VP
-its PRP$ B-NP
-grievance NN I-NP
-procedure NN I-NP
-in IN B-PP
-closed-circuit JJ B-NP
-TV NN I-NP
-programs NNS I-NP
-. . O
-
-Naturally RB B-ADVP
-, , O
-one CD B-NP
-of IN B-PP
-the DT B-NP
-best JJS I-NP
-ways NNS I-NP
-to TO B-VP
-guarantee VB I-VP
-visibility NN B-NP
-for IN B-PP
-your PRP$ B-NP
-due-process NN I-NP
-system NN I-NP
-is VBZ B-VP
-for IN B-SBAR
-top JJ B-NP
-management NN I-NP
-to TO B-VP
-support VB I-VP
-it PRP B-NP
-. . O
-
-At IN B-PP
-IBM NNP B-NP
-, , O
-the DT B-NP
-company NN I-NP
-'s POS B-NP
-Open NNP I-NP
-Door NNP I-NP
-system NN I-NP
-is VBZ B-VP
-sometimes RB B-ADVP
-the DT B-NP
-subject NN I-NP
-of IN B-PP
-memorandums NNS B-NP
-from IN B-PP
-the DT B-NP
-chief JJ I-NP
-executive NN I-NP
-. . O
-
-Federal NNP B-NP
-Express NNP I-NP
-goes VBZ B-VP
-further JJ B-ADVP
-in IN B-PP
-this DT B-NP
-respect NN I-NP
-than IN B-PP
-any DT B-NP
-company NN I-NP
-I PRP B-NP
-know VBP B-VP
-of IN B-PP
-with IN B-PP
-both DT B-NP
-Frederick NNP B-NP
-Smith NNP I-NP
-and CC O
-James NNP B-NP
-Barksdale NNP I-NP
-, , O
-chief JJ B-NP
-executive NN I-NP
-and CC O
-chief JJ B-NP
-operating VBG I-NP
-officer NN I-NP
-, , O
-respectively RB B-ADVP
-, , O
-sitting VBG B-VP
-in IN B-PRT
-on IN B-PP
-the DT B-NP
-Appeals NNP I-NP
-Board NNP I-NP
-almost RB B-NP
-every DT I-NP
-Tuesday NNP I-NP
-to TO B-VP
-decide VB I-VP
-cases NNS B-NP
-. . O
-
-Mr. NNP B-NP
-Ewing NNP I-NP
-is VBZ B-VP
-a DT B-NP
-consultant NN I-NP
-based VBN B-VP
-in IN B-PP
-Winchester NNP B-NP
-, , O
-Mass. NNP B-NP
-, , O
-and CC O
-author NN B-NP
-of IN B-PP
-`` `` O
-Justice NNP B-NP
-on IN B-PP
-the DT B-NP
-Job NNP I-NP
-: : O
-Resolving NNP B-VP
-Grievances NNP B-NP
-in IN B-PP
-the DT B-NP
-Nonunion NNP I-NP
-Workplace NN I-NP
-'' '' O
--LRB- ( O
-Harvard NNP B-NP
-Business NNP I-NP
-School NNP I-NP
-Press NNP I-NP
-, , O
-1989 CD B-NP
--RRB- ) O
-. . O
-
-Tokyo NNP B-NP
-stocks NNS I-NP
-closed VBD B-VP
-higher JJR B-ADVP
-in IN B-PP
-active JJ B-NP
-trading NN I-NP
-Friday NNP B-NP
-, , O
-marking VBG B-VP
-the DT B-NP
-fourth JJ I-NP
-consecutive JJ I-NP
-daily JJ I-NP
-gain NN I-NP
-since IN B-PP
-Monday NNP B-NP
-'s POS B-NP
-sharp JJ I-NP
-fall NN I-NP
-. . O
-
-London JJ B-NP
-shares NNS I-NP
-closed VBD B-VP
-moderately RB B-ADVP
-lower JJR I-ADVP
-in IN B-PP
-thin JJ B-NP
-trading NN I-NP
-. . O
-
-At IN B-PP
-Tokyo NNP B-NP
-, , O
-the DT B-NP
-Nikkei NNP I-NP
-index NN I-NP
-of IN B-PP
-225 CD B-NP
-selected VBN I-NP
-issues NNS I-NP
-was VBD B-VP
-up IN B-ADVP
-112.16 CD B-NP
-points NNS I-NP
-to TO B-PP
-35486.38 CD B-NP
-. . O
-
-The DT B-NP
-index NN I-NP
-advanced VBD B-VP
-266.66 CD B-NP
-points NNS I-NP
-Thursday NNP B-NP
-. . O
-
-In IN B-PP
-early JJ B-NP
-trading NN I-NP
-in IN B-PP
-Tokyo NNP B-NP
-Monday NNP B-NP
-, , O
-the DT B-NP
-Nikkei NNP I-NP
-index NN I-NP
-rose VBD B-VP
-101.98 CD B-NP
-points NNS I-NP
-to TO B-PP
-35588.36 CD B-NP
-. . O
-
-Friday NNP B-NP
-'s POS B-NP
-volume NN I-NP
-on IN B-PP
-the DT B-NP
-First NNP I-NP
-Section NN I-NP
-was VBD B-VP
-estimated VBN I-VP
-at IN B-PP
-one CD B-NP
-billion CD I-NP
-shares NNS I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-862 CD B-NP
-million CD I-NP
-Thursday NNP B-NP
-. . O
-
-Winners NNS B-NP
-outpaced VBD B-VP
-losers NNS B-NP
-, , O
-572 CD B-ADVP
-to TO I-ADVP
-368 CD I-ADVP
-, , O
-while IN B-SBAR
-181 CD B-NP
-issues NNS I-NP
-remained VBD B-VP
-unchanged JJ B-ADJP
-. . O
-
-With IN B-SBAR
-investors NNS B-NP
-relieved VBN B-ADJP
-at IN B-PP
-the DT B-NP
-overnight JJ I-NP
-gain NN I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-stocks NNS I-NP
-, , O
-small-lot JJ B-NP
-buying NN I-NP
-orders NNS I-NP
-streamed VBD B-VP
-into IN B-PP
-the DT B-NP
-market NN I-NP
-from IN B-PP
-early JJ B-NP
-morning NN I-NP
-, , O
-making VBG B-VP
-traders NNS B-NP
-believe VBP B-VP
-the DT B-NP
-market NN I-NP
-was VBD B-VP
-back RB B-ADVP
-to TO B-PP
-normal JJ B-NP
-. . O
-
-The DT B-NP
-Nikkei NNP I-NP
-, , O
-which WDT B-NP
-reached VBD B-VP
-as RB B-ADJP
-high JJ I-ADJP
-as IN B-PP
-35611.38 CD B-NP
-right NN B-ADVP
-after IN B-PP
-the DT B-NP
-opening NN I-NP
-, , O
-surrendered VBD B-VP
-part NN B-NP
-of IN B-PP
-its PRP$ B-NP
-early JJ I-NP
-advance NN I-NP
-toward IN B-PP
-the DT B-NP
-end NN I-NP
-of IN B-PP
-the DT B-NP
-day NN I-NP
-because IN B-PP
-of IN I-PP
-profit-taking NN B-NP
-. . O
-
-`` `` O
-Investors NNS B-NP
-, , B-NP
-especially RB I-NP
-dealers NNS B-NP
-, , O
-do VBP B-VP
-n't RB I-VP
-want VB I-VP
-to TO I-VP
-hold VB I-VP
-a DT B-NP
-position NN I-NP
-over IN B-PP
-the DT B-NP
-weekend NN I-NP
-, , O
-'' '' O
-a DT B-NP
-trader NN I-NP
-at IN B-PP
-Dai-ichi NNP B-NP
-Securities NNP I-NP
-said VBD B-VP
-, , O
-adding VBG B-VP
-, , O
-though RB B-ADVP
-, , O
-that IN B-SBAR
-the DT B-NP
-trading NN I-NP
-mood NN I-NP
-remained VBD B-VP
-positive JJ B-ADJP
-through IN B-PP
-the DT B-NP
-afternoon NN I-NP
-session NN I-NP
-. . O
-
-The DT B-NP
-Tokyo NNP I-NP
-Stock NNP I-NP
-Price NNP I-NP
-Index NNP I-NP
--LRB- ( O
-Topix NNP B-NP
--RRB- ) O
-of IN B-PP
-all DT B-NP
-issues NNS I-NP
-listed VBN B-VP
-in IN B-PP
-the DT B-NP
-First NNP I-NP
-Section NN I-NP
-, , O
-which WDT B-NP
-gained VBD B-VP
-22.78 CD B-NP
-points NNS I-NP
-Thursday NNP B-NP
-, , O
-was VBD B-VP
-up IN B-ADVP
-14.06 CD B-NP
-points NNS I-NP
-, , O
-or CC O
-0.53 CD B-NP
-% NN I-NP
-, , O
-at IN B-PP
-2679.72 CD B-NP
-. . O
-
-The DT B-NP
-Second JJ I-NP
-Section NN I-NP
-index NN I-NP
-, , O
-which WDT B-NP
-rose VBD B-VP
-15.72 CD B-NP
-points NNS I-NP
-Thursday NNP B-NP
-, , O
-was VBD B-VP
-up IN B-ADVP
-11.88 CD B-NP
-points NNS I-NP
-, , O
-or CC O
-0.32 CD B-NP
-% NN I-NP
-, , O
-to TO B-VP
-close VB I-VP
-at IN B-PP
-3717.46 CD B-NP
-. . O
-
-Volume NN B-NP
-in IN B-PP
-the DT B-NP
-second JJ I-NP
-section NN I-NP
-was VBD B-VP
-estimated VBN I-VP
-at IN B-PP
-30 CD B-NP
-million CD I-NP
-shares NNS I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-28 CD B-NP
-million CD I-NP
-Thursday NNP B-NP
-. . O
-
-In IN B-PP
-turmoil NN B-NP
-caused VBN B-VP
-by IN B-PP
-the DT O
-previous JJ B-NP
-Friday NNP I-NP
-'s POS B-NP
-plunge NN I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-stocks NNS I-NP
-, , O
-the DT B-NP
-Nikkei NNP I-NP
-marked VBD B-VP
-a DT B-NP
-sharp JJ I-NP
-647.33-point JJ I-NP
-fall NN I-NP
-Monday NNP B-NP
-. . O
-
-But CC O
-the DT B-NP
-Nikkei NNP I-NP
-fell VBD B-VP
-an DT B-NP
-overall JJ I-NP
-1.8 CD I-NP
-% NN I-NP
-in IN B-PP
-value NN B-NP
-that DT B-NP
-day NN I-NP
-compared VBN B-PP
-with IN B-PP
-Wall NNP B-NP
-Street NNP I-NP
-'s POS I-NP
-far RB B-ADJP
-sharper JJR I-ADJP
-6.9 CD B-ADJP
-% NN I-ADJP
-drop NN B-NP
-on IN B-PP
-Oct. NNP B-NP
-13 CD I-NP
-. . O
-
-The DT B-NP
-Tokyo NNP I-NP
-market NN I-NP
-'s POS B-NP
-resiliency NN I-NP
-helped VBD B-VP
-participants NNS B-NP
-to TO B-VP
-regain VB I-VP
-confidence NN B-NP
-gradually RB B-ADVP
-as IN B-SBAR
-they PRP B-NP
-spent VBD B-VP
-more JJR B-NP
-time NN I-NP
-on IN B-PP
-analyzing VBG B-VP
-factors NNS B-NP
-that WDT B-NP
-caused VBD B-VP
-the DT B-NP
-Friday NNP I-NP
-plunge NN I-NP
-and CC O
-realized VBD B-VP
-these DT B-NP
-problems NNS I-NP
-were VBD B-VP
-unique JJ B-ADJP
-to TO B-PP
-New NNP B-NP
-York NNP I-NP
-stocks NNS I-NP
-and CC B-ADJP
-not RB I-ADJP
-directly RB B-ADJP
-related VBN I-ADJP
-to TO B-PP
-Tokyo NNP B-NP
-. . O
-
-The DT B-NP
-Nikkei NNP I-NP
-continued VBD B-VP
-to TO I-VP
-gain VB I-VP
-for IN B-PP
-the DT B-NP
-rest NN I-NP
-of IN B-PP
-the DT B-NP
-week NN I-NP
-, , O
-adding VBG B-VP
-1017.69 CD B-NP
-points NNS I-NP
-in IN B-PP
-four CD B-NP
-days NNS I-NP
--- : O
-more JJR B-VP
-than IN I-VP
-erasing VBG I-VP
-Monday NNP B-NP
-'s POS B-NP
-losses NNS I-NP
-. . O
-
-But CC O
-further JJ B-NP
-major JJ I-NP
-advances NNS I-NP
-on IN B-PP
-the DT B-NP
-Nikkei NNP I-NP
-are VBP B-VP
-n't RB I-VP
-foreseen VBN I-VP
-this DT B-NP
-week NN I-NP
-by IN B-PP
-market NN B-NP
-observers NNS I-NP
-. . O
-
-Investors NNS B-NP
-are VBP B-VP
-still RB I-VP
-waiting VBG I-VP
-to TO I-VP
-see VB I-VP
-how WRB B-ADVP
-the DT B-NP
-U.S. NNP I-NP
-government NN I-NP
-will MD B-VP
-decide VB I-VP
-on IN B-PP
-interest NN B-NP
-rates NNS I-NP
-and CC O
-how WRB B-ADVP
-the DT B-NP
-dollar NN I-NP
-will MD B-VP
-be VB I-VP
-stabilized VBN I-VP
-. . O
-
-Some DT B-NP
-high-priced JJ I-NP
-issues NNS I-NP
-made VBD B-VP
-a DT B-NP
-comeback NN I-NP
-Friday NNP B-NP
-. . O
-
-Pioneer NNP B-NP
-surged VBD B-VP
-450 CD B-NP
-yen NN I-NP
--LRB- ( O
-$ $ B-NP
-3.16 CD I-NP
--RRB- ) O
-to TO B-PP
-6,050 CD B-NP
-yen NN I-NP
--LRB- ( O
-$ $ B-NP
-42.60 CD I-NP
--RRB- ) O
-. . O
-
-Kyocera NNP B-NP
-advanced VBD B-VP
-80 CD B-NP
-yen NN I-NP
-to TO B-PP
-5,440 CD B-NP
-. . O
-
-Fanuc NNP B-NP
-gained VBD B-VP
-100 CD B-NP
-to TO B-PP
-7,580 CD B-NP
-. . O
-
-Breweries NNP B-NP
-attracted VBD B-VP
-investors NNS B-NP
-because IN B-PP
-of IN I-PP
-their PRP$ B-NP
-land NN I-NP
-property NN I-NP
-holdings NNS I-NP
-that WDT B-NP
-could MD B-VP
-figure VB I-VP
-in IN B-PP
-development NN B-NP
-or CC O
-other JJ B-NP
-plans NNS I-NP
-, , O
-traders NNS B-NP
-said VBD B-VP
-. . O
-
-Sapporo NNP B-NP
-gained VBD B-VP
-80 CD B-NP
-to TO B-PP
-1,920 CD B-NP
-and CC O
-Kirin NNP B-NP
-added VBD B-VP
-60 CD B-NP
-to TO B-PP
-2,070 CD B-NP
-. . O
-
-Housings NNS B-NP
-, , I-NP
-constructions NNS I-NP
-and CC I-NP
-pharmaceuticals NNS I-NP
-continued VBD B-VP
-to TO I-VP
-be VB I-VP
-bought VBN I-VP
-following VBG B-PP
-Thursday NNP B-NP
-'s POS B-NP
-gains NNS I-NP
-because IN B-PP
-of IN I-PP
-strong JJ B-NP
-earnings NNS I-NP
-outlooks NNS I-NP
-. . O
-
-Daiwa NNP B-NP
-House NNP I-NP
-gained VBD B-VP
-50 CD B-NP
-to TO B-PP
-2,660 CD B-NP
-. . O
-
-Misawa NNP B-NP
-Homes NNP I-NP
-was VBD B-VP
-up IN B-ADVP
-20 CD B-NP
-at IN B-PP
-2,960 CD B-NP
-. . O
-
-Kajima NNP B-NP
-advanced VBD B-VP
-40 CD B-NP
-to TO B-PP
-2,120 CD B-NP
-and CC O
-Ohbayashi NNP B-NP
-added VBD B-VP
-50 CD B-NP
-to TO B-PP
-1,730 CD B-NP
-. . O
-
-Fujisawa NNP B-NP
-added VBD B-VP
-80 CD B-NP
-to TO B-PP
-2,010 CD B-NP
-and CC O
-Mochida NNP B-NP
-advanced VBD B-VP
-230 CD B-NP
-to TO B-PP
-4,400 CD B-NP
-. . O
-
-London JJ B-NP
-share NN I-NP
-prices NNS I-NP
-were VBD B-VP
-influenced VBN I-VP
-largely RB B-ADVP
-by IN B-PP
-declines NNS B-NP
-on IN B-PP
-Wall NNP B-NP
-Street NNP I-NP
-and CC O
-weakness NN B-NP
-in IN B-PP
-the DT B-NP
-British JJ I-NP
-pound NN I-NP
-. . O
-
-The DT B-NP
-key JJ I-NP
-Financial NNP I-NP
-Times-Stock NNP I-NP
-Exchange NNP I-NP
-100-share JJ I-NP
-index NN I-NP
-ended VBD B-VP
-10.2 CD B-NP
-points NNS I-NP
-lower JJR B-ADVP
-at IN B-PP
-2179.1 CD B-NP
-, , O
-above IN B-ADVP
-its PRP$ B-NP
-intraday JJ I-NP
-low NN I-NP
-of IN B-PP
-2176.9 CD B-NP
-, , B-ADVP
-but CC I-ADVP
-off IN B-ADVP
-the DT B-NP
-day NN I-NP
-'s POS I-NP
-high NN B-NP
-of IN B-PP
-2189 CD B-NP
-. . O
-
-The DT B-NP
-index NN I-NP
-finished VBD B-VP
-2.4 CD B-NP
-% NN I-NP
-under IN B-PP
-its PRP$ B-NP
-close NN I-NP
-of IN B-PP
-2233.9 CD B-NP
-the DT B-NP
-previous JJ I-NP
-Friday NNP I-NP
-, , O
-although IN B-SBAR
-it PRP B-NP
-recouped VBD B-VP
-some DT B-NP
-of IN B-PP
-the DT B-NP
-sharp JJ I-NP
-losses NNS I-NP
-staged VBD B-VP
-early JJ B-NP
-last JJ I-NP
-week NN I-NP
-on IN B-PP
-the DT B-NP
-back RB I-NP
-of IN B-PP
-Wall NNP B-NP
-Street NNP I-NP
-'s POS B-NP
-fall NN I-NP
-. . O
-
-London NNP B-NP
-was VBD B-VP
-weak JJ B-ADJP
-throughout IN B-PP
-Friday NNP B-NP
-'s POS B-NP
-trading NN I-NP
-, , O
-however RB B-ADVP
-, , O
-on IN B-PP
-what WP B-NP
-dealers NNS B-NP
-attributed VBD B-VP
-to TO B-PP
-generally RB B-NP
-thin JJ I-NP
-interest NN I-NP
-ahead RB B-ADVP
-of IN B-PP
-the DT B-NP
-weekend NN I-NP
-and CC O
-this DT B-NP
-week NN I-NP
-'s POS I-NP
-potentially RB B-ADJP
-important JJ I-ADJP
-U.K. NNP B-NP
-trade NN I-NP
-figures NNS I-NP
-for IN B-PP
-September NNP B-NP
-. . O
-
-The DT B-NP
-FT-SE NNP I-NP
-100 CD I-NP
-largely RB B-ADVP
-remained VBD B-VP
-within IN B-PP
-an DT B-NP
-11-point JJ I-NP
-range NN I-NP
-establshed VBN B-VP
-within IN B-PP
-the DT B-NP
-first JJ I-NP
-hour NN I-NP
-of IN B-PP
-trading NN B-NP
-before IN B-PP
-it PRP B-NP
-eased VBD B-VP
-to TO B-PP
-an DT B-NP
-intraday JJ I-NP
-low JJ I-NP
-late RB B-ADVP
-in IN B-PP
-the DT B-NP
-session NN I-NP
-when WRB B-ADVP
-a DT B-NP
-flurry NN I-NP
-of IN B-PP
-program NN B-NP
-selling VBG I-NP
-pushed VBN B-VP
-Wall NNP B-NP
-Street NNP I-NP
-lower JJR B-ADVP
-. . O
-
-The DT B-NP
-FT NNP I-NP
-30-share JJ I-NP
-index NN I-NP
-closed VBD B-VP
-11.0 CD B-NP
-points NNS I-NP
-lower JJR B-ADVP
-at IN B-PP
-1761.0 CD B-NP
-. . O
-
-Volume NN B-NP
-was VBD B-VP
-extremely RB B-ADJP
-thin JJ I-ADJP
-at IN B-PP
-351.3 CD B-NP
-million CD I-NP
-shares NNS I-NP
-, , O
-the DT B-NP
-lightest JJS I-NP
-volume NN I-NP
-of IN B-PP
-the DT B-NP
-week NN I-NP
-and CC O
-modestly RB B-ADVP
-under IN B-PP
-Thursday NNP B-NP
-'s POS B-NP
-387.4 CD I-NP
-million CD I-NP
-shares NNS I-NP
-. . O
-
-Dealers NNS B-NP
-said VBD B-VP
-the DT B-NP
-day NN I-NP
-'s POS B-NP
-action NN I-NP
-was VBD B-VP
-featureless JJ B-ADJP
-outside IN B-PP
-some DT B-NP
-response NN I-NP
-to TO B-PP
-sterling NN B-NP
-'s POS B-NP
-early JJ I-NP
-weakness NN I-NP
-against IN B-PP
-the DT B-NP
-mark NN I-NP
-, , O
-and CC O
-fears NNS B-NP
-that IN B-SBAR
-Wall NNP B-NP
-Street NNP I-NP
-might MD B-VP
-open RB I-VP
-lower JJR B-ADVP
-after IN B-PP
-its PRP$ B-NP
-strong JJ I-NP
-leap NN I-NP
-forward RB B-ADVP
-Thursday NNP B-NP
-. . O
-
-They PRP B-NP
-added VBD B-VP
-that IN B-SBAR
-market-makers NNS B-NP
-were VBD B-VP
-largely RB I-VP
-sidelined VBN I-VP
-after IN B-PP
-aggressively RB B-VP
-supporting VBG I-VP
-the DT B-NP
-market NN I-NP
-Thursday NNP B-NP
-in IN B-PP
-their PRP$ B-NP
-quest NN I-NP
-to TO B-VP
-cover VB I-VP
-internal JJ B-NP
-shortages NNS I-NP
-of IN B-PP
-FT-SE NNP B-NP
-100 CD I-NP
-shares NNS I-NP
-. . O
-
-Interest NN B-NP
-may MD B-VP
-remain VB I-VP
-limited JJ B-ADJP
-into IN B-PP
-tomorrow NN B-NP
-'s POS B-NP
-U.K. NNP I-NP
-trade NN I-NP
-figures NNS I-NP
-, , O
-which WDT B-NP
-the DT B-NP
-market NN I-NP
-will MD B-VP
-be VB I-VP
-watching VBG I-VP
-closely RB B-ADVP
-to TO B-VP
-see VB I-VP
-if IN B-SBAR
-there EX B-NP
-is VBZ B-VP
-any DT B-NP
-improvement NN I-NP
-after IN B-PP
-disappointing JJ B-NP
-numbers NNS I-NP
-in IN B-PP
-the DT B-NP
-previous JJ I-NP
-two CD I-NP
-months NNS I-NP
-. . O
-
-The DT B-NP
-key JJ I-NP
-corporate JJ I-NP
-news NN I-NP
-of IN B-PP
-the DT B-NP
-day NN I-NP
-was VBD B-VP
-that IN B-SBAR
-British JJ B-NP
-Airways NNPS I-NP
-decided VBD B-VP
-to TO I-VP
-withdraw VB I-VP
-from IN B-PP
-a DT B-NP
-management-led JJ I-NP
-bid NN I-NP
-for IN B-PP
-UAL NNP B-NP
-Corp. NNP I-NP
-, , O
-the DT B-NP
-parent NN I-NP
-of IN B-PP
-United NNP B-NP
-Airlines NNPS I-NP
-. . O
-
-British JJ B-NP
-Airways NNPS I-NP
-rose VBD B-VP
-initially RB B-ADVP
-after IN B-PP
-announcing VBG B-VP
-its PRP$ B-NP
-withdrawal NN I-NP
-from IN B-PP
-the DT B-NP
-UAL NNP I-NP
-deal NN I-NP
-. . O
-
-Dealers NNS B-NP
-said VBD B-VP
-they PRP B-NP
-viewed VBD B-VP
-the DT O
-initial JJ O
-# # O
-390-million CD O
--LRB- ( O
-$ $ B-ADJP
-622 CD O
-million CD O
--RRB- ) O
-outlay NN B-NP
-for IN B-PP
-a DT B-NP
-15 CD I-NP
-% NN I-NP
-stake NN I-NP
-in IN B-PP
-the DT B-NP
-airline NN I-NP
-as IN B-PP
-a DT B-NP
-bit NN I-NP
-much JJ I-NP
-. . O
-
-Its PRP$ B-NP
-shares NNS I-NP
-slid VBD B-VP
-in IN B-PP
-late JJ B-NP
-dealings NNS I-NP
-to TO B-VP
-close VB I-VP
-a DT B-NP
-penny NN I-NP
-per IN B-PP
-share NN B-NP
-lower JJR B-ADVP
-at IN B-PP
-197 CD B-NP
-pence NN I-NP
-. . O
-
-The DT B-NP
-airline NN I-NP
-was VBD B-VP
-the DT B-NP
-most RBS I-NP
-active JJ I-NP
-FT-SE NNP I-NP
-100 CD I-NP
-at IN B-PP
-8.2 CD B-NP
-million CD I-NP
-shares NNS I-NP
-traded VBN B-VP
-. . O
-
-The DT B-NP
-next JJ I-NP
-most RBS I-NP
-active JJ I-NP
-top-tier JJ I-NP
-stock NN I-NP
-was VBD B-VP
-B.A.T NNP B-NP
-Industries NNPS I-NP
-, , O
-the DT B-NP
-target NN I-NP
-of IN B-PP
-Sir NNP B-NP
-James NNP I-NP
-Goldsmith NNP I-NP
-'s POS B-NP
-# # B-ADJP
-13.4 CD O
-billion CD O
-bid NN B-NP
-. . O
-
-The DT B-NP
-company NN I-NP
-gained VBD B-VP
-shareholder NN B-NP
-approval NN I-NP
-Thursday NNP B-NP
-to TO B-VP
-restructure VB I-VP
-in IN B-PP
-a DT B-NP
-bid NN I-NP
-to TO B-VP
-fend VB I-VP
-off IN B-PRT
-the DT B-NP
-hostile JJ I-NP
-takeover NN I-NP
-. . O
-
-Sir NNP B-NP
-James NNP I-NP
-said VBD B-VP
-Thursday NNP B-NP
-night NN I-NP
-that IN B-SBAR
-his PRP$ B-NP
-plans NNS I-NP
-for IN B-PP
-the DT B-NP
-takeover NN I-NP
-had VBD B-VP
-n't RB I-VP
-changed VBN I-VP
-. . O
-
-B.A.T NNP B-NP
-ended VBD B-VP
-the DT B-NP
-day NN I-NP
-at IN B-PP
-778 CD B-NP
-, , O
-down JJ B-ADVP
-5 NN B-NP
-, , O
-on IN B-PP
-turnover NN B-NP
-of IN B-PP
-7.5 CD B-NP
-million CD I-NP
-shares NNS I-NP
-. . O
-
-Dealers NNS B-NP
-said VBD B-VP
-it PRP B-NP
-was VBD B-VP
-hit VBN I-VP
-by IN B-PP
-some DT B-NP
-profit-taking NN I-NP
-after IN B-PP
-gains NNS B-NP
-since IN B-PP
-mid-week NN B-NP
-. . O
-
-In IN B-PP
-other JJ B-NP
-active JJ I-NP
-shares NNS I-NP
-, , O
-Trusthouse NNP B-NP
-Forte NNP I-NP
-shed VB B-VP
-10 CD B-NP
-to TO B-PP
-294 CD B-NP
-on IN B-PP
-volume NN B-NP
-of IN B-PP
-6.4 CD B-NP
-million CD I-NP
-shares NNS I-NP
-after IN B-PP
-a DT B-NP
-Barclays NNP I-NP
-De NNP I-NP
-Zoete NNP I-NP
-Wedd NNP I-NP
-downgrading NN I-NP
-, , O
-while IN B-SBAR
-Hillsdown NNP B-NP
-Holdings NNP I-NP
-, , O
-a DT B-NP
-food NN I-NP
-products NNS I-NP
-concern VBP I-NP
-, , O
-was VBD B-VP
-boosted VBN I-VP
-2 CD B-NP
-to TO B-PP
-271 CD B-NP
-after IN O
-it PRP B-NP
-disclosed VBD B-VP
-it PRP B-NP
-would MD B-VP
-seek VB I-VP
-shareholder NN B-NP
-approval NN I-NP
-to TO B-VP
-begin VB I-VP
-share NN B-NP
-repurchases NNS I-NP
-. . O
-
-Elsewhere RB B-ADVP
-in IN B-PP
-Europe NNP B-NP
-, , O
-share NN B-NP
-prices NNS I-NP
-closed VBD B-VP
-higher JJR B-ADVP
-in IN B-PP
-Stockholm NNP B-NP
-, , I-NP
-Brussels NNP I-NP
-and CC I-NP
-Milan NNP I-NP
-. . O
-
-Prices NNS B-NP
-were VBD B-VP
-lower JJR B-ADJP
-in IN B-PP
-Frankfurt NNP B-NP
-, , I-NP
-Zurich NNP I-NP
-, , I-NP
-Paris NNP I-NP
-and CC I-NP
-Amsterdam NNP I-NP
-. . O
-
-South JJ B-NP
-African JJ I-NP
-gold NN I-NP
-stocks NNS I-NP
-closed VBD B-VP
-moderately RB B-ADVP
-lower JJR I-ADVP
-. . O
-
-Share NN B-NP
-prices NNS I-NP
-closed VBD B-VP
-higher JJR B-ADVP
-in IN B-PP
-Sydney NNP B-NP
-, , O
-Taipei NNP B-NP
-, , O
-Wellington NNP B-NP
-, , O
-Manila NNP B-NP
-, , O
-Hong NNP B-NP
-Kong NNP I-NP
-and CC O
-Singapore NNP B-NP
-and CC O
-were VBD B-VP
-lower JJR B-ADJP
-in IN B-PP
-Seoul NNP B-NP
-. . O
-
-Here RB B-ADVP
-are VBP B-VP
-price NN B-NP
-trends NNS I-NP
-on IN B-PP
-the DT B-NP
-world NN I-NP
-'s POS B-NP
-major JJ I-NP
-stock NN I-NP
-markets NNS I-NP
-, , O
-as IN B-SBAR
-calculated VBN B-VP
-by IN B-PP
-Morgan NNP B-NP
-Stanley NNP I-NP
-Capital NNP I-NP
-International NNP I-NP
-Perspective NNP I-NP
-, , O
-Geneva NNP B-NP
-. . O
-
-To TO B-VP
-make VB I-VP
-them PRP B-NP
-directly RB B-ADJP
-comparable JJ I-ADJP
-, , O
-each DT B-NP
-index NN I-NP
-is VBZ B-VP
-based VBN I-VP
-on IN B-PP
-the DT B-NP
-close NN I-NP
-of IN B-PP
-1969 CD B-NP
-equaling VBG B-VP
-100 CD B-NP
-. . O
-
-The DT B-NP
-percentage NN I-NP
-change NN I-NP
-is VBZ B-VP
-since IN B-PP
-year-end NN B-NP
-. . O
-
-The DT B-NP
-U.S. NNP I-NP
-is VBZ B-VP
-required VBN I-VP
-to TO I-VP
-notify VB I-VP
-foreign JJ B-NP
-dictators NNS I-NP
-if IN B-SBAR
-it PRP B-NP
-knows VBZ B-VP
-of IN B-PP
-coup NN B-NP
-plans NNS I-NP
-likely JJ B-ADJP
-to TO B-VP
-endanger VB I-VP
-their PRP$ B-NP
-lives NNS I-NP
-, , O
-government NN B-NP
-officials NNS I-NP
-said VBD B-VP
-. . O
-
-The DT B-NP
-notification NN I-NP
-policy NN I-NP
-was VBD B-VP
-part NN B-NP
-of IN B-PP
-a DT B-NP
-set NN I-NP
-of IN B-PP
-guidelines NNS B-NP
-on IN B-PP
-handling NN B-VP
-coups NNS B-NP
-outlined VBN B-VP
-in IN B-PP
-a DT B-NP
-secret JJ I-NP
-1988 CD I-NP
-exchange NN I-NP
-of IN B-PP
-letters NNS B-NP
-between IN B-PP
-the DT B-NP
-Reagan NNP I-NP
-administration NN I-NP
-and CC O
-the DT B-NP
-Senate NNP I-NP
-Intelligence NNP I-NP
-Committee NNP I-NP
-. . O
-
-The DT B-NP
-existence NN I-NP
-of IN B-PP
-the DT B-NP
-guidelines NNS I-NP
-has VBZ B-VP
-become VBN I-VP
-known VBN I-VP
-since IN B-SBAR
-President NNP B-NP
-Bush NNP I-NP
-disclosed VBD B-VP
-them PRP B-NP
-privately RB B-ADVP
-to TO B-PP
-seven CD B-NP
-Republican NNP I-NP
-senators NNS I-NP
-at IN B-PP
-a DT B-NP
-White NNP I-NP
-House NNP I-NP
-meeting NN I-NP
-last JJ B-NP
-Monday NNP I-NP
-. . O
-
-Officials NNS B-NP
-familiar JJ B-ADJP
-with IN B-PP
-the DT B-NP
-meeting NN I-NP
-said VBD B-VP
-Mr. NNP B-NP
-Bush NNP I-NP
-cited VBD B-VP
-the DT B-NP
-policy NN I-NP
-as IN B-PP
-an DT B-NP
-example NN I-NP
-of IN B-PP
-the DT B-NP
-sort NN I-NP
-of IN B-PP
-congressional JJ B-NP
-requirements NNS I-NP
-the DT B-NP
-administration NN I-NP
-contends VBZ B-VP
-contribute VB B-VP
-to TO B-PP
-the DT B-NP
-failure NN I-NP
-of IN B-PP
-such JJ B-NP
-covert JJ I-NP
-actions NNS I-NP
-as IN B-PP
-this DT B-NP
-month NN I-NP
-'s POS B-NP
-futile JJ I-NP
-effort NN I-NP
-to TO B-VP
-oust VB I-VP
-Panamanian JJ B-NP
-dictator NN I-NP
-Manuel NNP I-NP
-Noriega NNP I-NP
-. . O
-
-According VBG B-PP
-to TO B-PP
-the DT B-NP
-officials NNS I-NP
-, , O
-Mr. NNP B-NP
-Bush NNP I-NP
-even RB B-ADVP
-read VB B-VP
-to TO B-PP
-the DT B-NP
-senators NNS I-NP
-selections NNS B-NP
-from IN B-PP
-a DT B-NP
-highly RB I-NP
-classified VBN I-NP
-letter NN I-NP
-from IN B-PP
-the DT B-NP
-committee NN I-NP
-to TO B-PP
-the DT B-NP
-White NNP I-NP
-House NNP I-NP
-discussing VBG B-VP
-the DT B-NP
-guidelines NNS I-NP
-. . O
-
-They PRP B-NP
-said VBD B-VP
-the DT B-NP
-president NN I-NP
-conceded VBD B-VP
-the DT B-NP
-notification NN I-NP
-requirement NN I-NP
-did VBD B-VP
-n't RB I-VP
-affect VB I-VP
-his PRP$ B-NP
-decision NN I-NP
-to TO B-VP
-lend VB I-VP
-only RB B-NP
-minor JJ I-NP
-support NN I-NP
-to TO B-PP
-this DT B-NP
-month NN I-NP
-'s POS B-NP
-Panama NNP I-NP
-coup NN I-NP
-effort NN I-NP
-. . O
-
-No DT B-NP
-notification NN I-NP
-was VBD B-VP
-ever RB I-VP
-considered VBN I-VP
-, , O
-officials NNS B-NP
-said VBD B-VP
-, , O
-apparently RB B-ADVP
-because IN B-SBAR
-the DT B-NP
-U.S. NNP I-NP
-did VBD B-VP
-n't RB I-VP
-think VB I-VP
-the DT B-NP
-coup NN I-NP
-plotters NNS I-NP
-intended VBN B-VP
-to TO I-VP
-kill VB I-VP
-Mr. NNP B-NP
-Noriega NNP I-NP
-, , O
-but CC O
-merely RB B-VP
-sought VBD I-VP
-to TO I-VP
-imprison VB I-VP
-him PRP B-NP
-. . O
-
-What WP B-NP
-'s VBZ B-VP
-more JJR B-NP
-, , O
-both DT B-NP
-administration NN B-NP
-and CC O
-congressional JJ B-NP
-officials NNS I-NP
-hint VBP B-VP
-that IN B-SBAR
-the DT B-NP
-notification NN I-NP
-requirement NN I-NP
-is VBZ B-VP
-likely JJ B-ADJP
-to TO B-VP
-be VB I-VP
-dropped VBN I-VP
-from IN B-PP
-the DT B-NP
-guidelines NNS I-NP
-on IN B-PP
-coup NN B-NP
-attempts NNS I-NP
-that WDT B-NP
-are VBP B-VP
-being VBG I-VP
-rewritten VBN I-VP
-by IN B-PP
-the DT B-NP
-panel NN I-NP
-and CC O
-the DT B-NP
-White NNP I-NP
-House NNP I-NP
-. . O
-
-The DT B-NP
-rewriting VBG I-NP
-was VBD B-VP
-launched VBN I-VP
-at IN B-PP
-a DT B-NP
-meeting NN I-NP
-between IN B-PP
-Mr. NNP B-NP
-Bush NNP I-NP
-and CC O
-intelligence NN B-NP
-committee NN I-NP
-leaders NNS I-NP
-Oct. NNP B-NP
-12 CD I-NP
-, , O
-a DT B-NP
-few JJ I-NP
-days NNS I-NP
-before IN B-PP
-the DT B-NP
-meeting NN I-NP
-at IN B-PP
-which WDT B-NP
-the DT B-NP
-president NN I-NP
-complained VBD B-VP
-about IN B-PP
-the DT B-NP
-rules NNS I-NP
-. . O
-
-However RB B-ADVP
-, , O
-the DT B-NP
-disclosure NN I-NP
-of IN B-PP
diff --git a/paddle/trainer/tests/train_files.txt b/paddle/trainer/tests/train_files.txt
deleted file mode 100644
index 1c268914953ff090ae47c56051fcf1cad0e1707b..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/train_files.txt
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/train_proto.bin
diff --git a/paddle/trainer/tests/train_sparse.list b/paddle/trainer/tests/train_sparse.list
deleted file mode 100644
index 6ea020e2202f8464f8a647cd96c84a9d17a03ae3..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/train_sparse.list
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/compare_sparse_data
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 2c2cc6245932d4af56a68d6399ce31f008bf3748..2fcdbbc8bd671f8ae911cf82c7a91091f252a82f 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -540,6 +540,13 @@ message LayerConfig {
 
   // for switch order layer
   optional ReshapeConfig reshape_conf = 59;
+
+  // for batch normalization layer
+  // The small constant added to the variance to improve numeric stability.
+  optional double epsilon = 60 [ default = 0.00001 ];
+
+  // for factorization machine layer
+  optional uint32 factor_size = 61;
 }
 
 message EvaluatorConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 5bd68e211ac1c8e05f40dc3ca37eef99f32af47f..5b173694dd0e4a52c0179f12f5edd74e2c41cb8c 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1116,35 +1116,6 @@ def PyData(files=None,
     return data_config
 
 
-@config_func
-def ProtoData(files=None,
-              type=None,
-              file_group_queue_capacity=None,
-              load_file_count=None,
-              constant_slots=None,
-              load_thread_num=None,
-              **xargs):
-    data_config = create_data_config_proto(**xargs)
-    if type is None:
-        data_config.type = 'proto'
-    else:
-        data_config.type = type
-    data_config.files = files
-
-    # When type="proto_group", one data provider contains at most
-    # load_file_count files, and there are at most
-    # (queue_capacity + load_thread_num + 1) data providers in memory
-    if file_group_queue_capacity is not None:
-        data_config.file_group_conf.queue_capacity = file_group_queue_capacity
-    if load_file_count is not None:
-        data_config.file_group_conf.load_file_count = load_file_count
-    if load_thread_num is not None:
-        data_config.file_group_conf.load_thread_num = load_thread_num
-    if constant_slots:
-        data_config.constant_slots.extend(constant_slots)
-    return data_config
-
-
 #real data for training is actually provided by "sub_data" data providers.
 @config_func
 def MultiData(sub_data=[]):
@@ -1826,7 +1797,7 @@ class FCLayer(LayerBase):
             self.layer_type = 'mkldnn_fc'
             config_assert(
                 len(inputs) == 1,
-                "MkldnnFCLayer support one and only one input!")
+                "MKLDNNFCLayer support one and only one input!")
         super(FCLayer, self).__init__(
             name, self.layer_type, size, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
@@ -1837,7 +1808,7 @@ class FCLayer(LayerBase):
             sparse = format == "csr" or format == "csc"
             if use_mkldnn:
                 config_assert(not sparse,
-                              "MkldnnFCLayer do not support sparse format yet")
+                              "MKLDNNFCLayer do not support sparse format yet")
                 if use_mkldnn_wgt:
                     dims = [self.config.size, input_layer.size]
             if sparse:
@@ -1853,7 +1824,7 @@ class FCLayer(LayerBase):
 
 
 @config_layer('mkldnn_fc')
-class MkldnnFcLayer(FCLayer):
+class MKLDNNFcLayer(FCLayer):
     layer_type = 'mkldnn_fc'
 
 
@@ -2066,13 +2037,20 @@ class ParameterReluLayer(LayerBase):
     def __init__(self, name, inputs, partial_sum=1, **args):
         super(ParameterReluLayer, self).__init__(
             name, self.layer_type, 0, inputs=inputs, **args)
+
         input_layer = self.get_input_layer(0)
         config_assert(len(self.inputs) == 1, "prelu layer has only one input.")
         config_assert(input_layer.size % partial_sum == 0,
                       "a wrong setting for partial_sum")
+
+        dims = [1, input_layer.size / partial_sum]
         self.set_layer_size(input_layer.size)
         self.config.partial_sum = partial_sum
-        self.create_input_parameter(0, input_layer.size / partial_sum)
+        self.create_input_parameter(0, input_layer.size / partial_sum, dims)
+
+        self.set_layer_height_width(self.get_input_layer(0).height, \
+                                        self.get_input_layer(0).width)
+        self.set_layer_depth(self.get_input_layer(0).depth)
 
 
 @config_layer('conv')
@@ -2422,6 +2400,14 @@ class CropLayer(LayerBase):
         image_conf.img_size_y = input_layer.height
         image_conf.channels = input_layer.size / (input_layer.width *
                                                   input_layer.height)
+        # only support for 4-dims inputs and NCHW order
+        if (len(self.config.inputs) == 2):
+            self.set_layer_height_width(
+                self.get_input_layer(1).height, self.get_input_layer(1).width)
+            self.set_layer_size(self.get_input_layer(1).size)
+        else:
+            self.set_layer_height_width(shape[-2], shape[-1])
+            self.set_layer_size(reduce(lambda x, y: x * y, shape[1:]))
 
 
 @config_layer('batch_norm')
@@ -2434,6 +2420,7 @@ class BatchNormLayer(LayerBase):
                  bias=True,
                  img3D=False,
                  use_global_stats=True,
+                 epsilon=1e-5,
                  moving_average_fraction=0.9,
                  batch_norm_type=None,
                  mean_var_names=None,
@@ -2482,6 +2469,9 @@ class BatchNormLayer(LayerBase):
             self.config.use_global_stats = use_global_stats
         if moving_average_fraction is not None:
             self.config.moving_average_fraction = moving_average_fraction
+        if epsilon is not None:
+            assert epsilon >= 1e-5, "epsilon must be no less than 1e-5."
+            self.config.epsilon = epsilon
 
         input_layer = self.get_input_layer(0)
         image_conf = self.config.inputs[0].image_conf
@@ -2714,7 +2704,7 @@ Usage:
              max_sort_size = -1, inputs = ["output", "score"])
 
   Input data: Samples of the same query should be loaded as a sequence,
-          by ProtoDataProvider or PyDataProvider etc.. User should provide
+          by PyDataProvider etc.. User should provide
           scores for each sample. The score slot should be the 2nd
           input of lambdaRank layer.
 
@@ -2816,19 +2806,18 @@ class AddToLayer(LayerBase):
             name, self.layer_type, 0, inputs=inputs, **xargs)
         config_assert(len(inputs) > 0, 'inputs cannot be empty for AddToLayer')
 
-        if len(self.inputs) > 1:
-            for input_index in xrange(len(self.inputs)):
-                assert self.get_input_layer(0).height == self.get_input_layer(
-                    input_index).height
-                assert self.get_input_layer(0).width == self.get_input_layer(
-                    input_index).width
-                assert self.get_input_layer(0).depth == self.get_input_layer(
-                    input_index).depth
+        layer_size = self.get_input_layer(0).size
+        # To reserve heght, width, depth.
+        layer_with_hwc = self.get_input_layer(0)
+        for input_index in xrange(len(self.inputs)):
+            input_layer = self.get_input_layer(input_index)
+            assert layer_size == input_layer.size
+            if input_layer.height and input_layer.height and input_layer.height:
+                layer_with_hwc = input_layer
 
-        self.set_layer_size(self.get_input_layer(0).size)
-        self.set_layer_height_width(self.get_input_layer(0).height, \
-                                        self.get_input_layer(0).width)
-        self.set_layer_depth(self.get_input_layer(0).depth)
+        self.set_layer_size(layer_with_hwc.size)
+        self.set_layer_height_width(layer_with_hwc.height, layer_with_hwc.width)
+        self.set_layer_depth(layer_with_hwc.depth)
         self.create_bias_parameter(bias, self.config.size)
 
 
@@ -3209,6 +3198,18 @@ class SubNestedSequenceLayer(LayerBase):
         self.set_layer_size(size)
 
 
+@config_layer('dot_prod')
+class DotProdLayer(LayerBase):
+    def __init__(self, name, inputs, device=None):
+        super(DotProdLayer, self).__init__(
+            name, 'dot_prod', 0, inputs, device=device)
+        config_assert(len(inputs) == 2, 'DotProdLayer must have 2 inputs.')
+        config_assert(
+            self.get_input_layer(0).size == self.get_input_layer(1).size,
+            "Two inputs should have the same size.")
+        self.set_layer_size(1)
+
+
 @config_layer('out_prod')
 class OuterProdLayer(LayerBase):
     def __init__(self, name, inputs, device=None):
@@ -3330,6 +3331,20 @@ class RowL2NormLayer(LayerBase):
         self.set_layer_size(input_layer.size)
 
 
+@config_layer('cos')
+class CosSimLayer(LayerBase):
+    def __init__(self, name, inputs, cos_scale=1, device=None):
+        super(CosSimLayer, self).__init__(
+            name, 'cos', 1, inputs=inputs, device=device)
+        config_assert(
+            len(self.inputs) == 2,
+            'The CosSimLayer expects two and only two inputs.')
+        config_assert(
+            self.get_input_layer(0).size == self.get_input_layer(1).size,
+            'The two inputs of CosSimLayer must have the same dimensionality.')
+        self.config.cos_scale = cos_scale
+
+
 @config_layer('cos_vm')
 class CosSimVecMatLayer(LayerBase):
     def __init__(self, name, size, inputs, cos_scale=1.0, device=None):
@@ -3337,10 +3352,24 @@ class CosSimVecMatLayer(LayerBase):
             name, 'cos_vm', size, inputs=inputs, device=device)
         self.config.cos_scale = cos_scale
         config_assert(
-            len(self.inputs) == 2, 'CosSimVecMatLayer must have 2 inputs')
+            len(self.inputs) == 2, 'The CosSimVecMatLayer must have 2 inputs.')
         config_assert(
             size * self.get_input_layer(0).size == self.get_input_layer(1).size,
-            'Wrong input size for CosSimVecMatLayer')
+            'Wrong input size for CosSimVecMatLayer.')
+
+
+@config_layer('l2_distance')
+class L2DistanceLayer(LayerBase):
+    def __init__(self, name, inputs, device=None):
+        super(L2DistanceLayer, self).__init__(
+            name, 'l2_distance', 1, inputs=inputs, device=device)
+        config_assert(
+            len(self.inputs) == 2, ('The L2DistanceLayer must have '
+                                    'and only have 2 inputs.'))
+        config_assert(
+            self.get_input_layer(0).size == self.get_input_layer(1).size,
+            ('Two inputs of the L2DistanceLayer must have '
+             'the same dimensionality.'))
 
 
 @config_layer('sampling_id')
@@ -3384,18 +3413,6 @@ class AverageLayer(LayerBase):
         self.create_bias_parameter(bias, self.config.size)
 
 
-@config_layer('cos')
-class CosSimLayer(LayerBase):
-    def __init__(self, name, inputs, cos_scale=1, device=None):
-        super(CosSimLayer, self).__init__(
-            name, 'cos', 1, inputs=inputs, device=device)
-        config_assert(len(self.inputs) == 2, 'CosSimLayer must have 2 inputs')
-        config_assert(
-            self.get_input_layer(0).size == self.get_input_layer(1).size,
-            'inputs of CosSimLayer must have same dim')
-        self.config.cos_scale = cos_scale
-
-
 @config_layer('tensor')
 class TensorLayer(LayerBase):
     def __init__(self, name, size, inputs, bias=True, **xargs):
@@ -3506,11 +3523,17 @@ def ExpressionLayer(name, inputs, **xargs):
 
 @config_layer('concat')
 class ConcatenateLayer(LayerBase):
+    layer_type = 'concat'
+
     def __init__(self, name, inputs, bias=False, **xargs):
         config_assert(inputs, 'inputs cannot be empty')
         config_assert(not bias, 'ConcatenateLayer cannot support bias.')
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        if self.layer_type == "mkldnn_concat":
+            config_assert(use_mkldnn, "mkldnn_concat only support MKLDNN")
+        self.layer_type = 'mkldnn_concat' if use_mkldnn else 'concat'
         super(ConcatenateLayer, self).__init__(
-            name, 'concat', 0, inputs=inputs, **xargs)
+            name, self.layer_type, 0, inputs=inputs, **xargs)
         size = 0
         for input_index in xrange(len(self.inputs)):
             assert self.get_input_layer(0).height == self.get_input_layer(
@@ -3530,6 +3553,11 @@ class ConcatenateLayer(LayerBase):
         self.set_layer_size(size)
 
 
+@config_layer('mkldnn_concat')
+class MKLDNNConcatLayer(ConcatenateLayer):
+    layer_type = 'mkldnn_concat'
+
+
 # like concat layer, but each input layer was processed by a Projection.
 @config_layer('concat2')
 class ConcatenateLayer2(LayerBase):
@@ -3829,6 +3857,26 @@ class SwitchOrderLayer(LayerBase):
             name, 'switch_order', 0, inputs=inputs, **xargs)
         self.config.reshape_conf.height_axis.extend(reshape['height'])
         self.config.reshape_conf.width_axis.extend(reshape['width'])
+        input_layer = self.get_input_layer(0)
+        if reshape is None:
+            self.set_layer_size(input_layer.size)
+        else:
+            in_h = input_layer.height
+            in_w = input_layer.width
+            out_dims = None
+            if input_layer.has_depth():
+                in_d = input_layer.depth
+                in_c = input_layer.size / in_h / in_w / in_d
+                # batch_size, depth, height, width, channel
+                out_dims = [0, in_d, in_h, in_w, in_c]
+            else:
+                in_c = input_layer.size / in_h / in_w
+                # batch_size, height, width, channel
+                out_dims = [0, in_h, in_w, in_c]
+            # Because (reshape['width'][0] > 0) always be true.
+            # So out_dims[0] won't be used.
+            size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:])
+            self.set_layer_size(size)
 
 
 @config_layer('scale_sub_region')
@@ -3850,6 +3898,21 @@ class ScaleSubRegionLayer(LayerBase):
                            image_conf.channels)
 
 
+@config_layer('factorization_machine')
+class FactorizationMachineLayer(LayerBase):
+    def __init__(self, name, inputs, factor_size, **xargs):
+        super(FactorizationMachineLayer, self).__init__(
+            name, 'factorization_machine', size=1, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'factorization machine layer must have one and only one input.')
+        self.config.factor_size = factor_size
+        input_layer = self.get_input_layer(0)
+        psize = input_layer.size * factor_size
+        dims = [input_layer.size, factor_size]
+        self.create_input_parameter(0, psize, dims)
+
+
 # Deprecated, use a new layer specific class instead
 @config_func
 def Layer(name, type, **xargs):
diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py
index c749fa827fea4a808ab715dcb3442aa24d06a4d2..00efc01c0592107314f5b23c951706d039d49a88 100644
--- a/python/paddle/trainer_config_helpers/activations.py
+++ b/python/paddle/trainer_config_helpers/activations.py
@@ -17,7 +17,8 @@ __all__ = [
     "IdentityActivation", "LinearActivation", 'SequenceSoftmaxActivation',
     'ExpActivation', "ReluActivation", "BReluActivation", "SoftReluActivation",
     "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation",
-    "LogActivation", "SqrtActivation", "ReciprocalActivation"
+    "LogActivation", "SqrtActivation", "ReciprocalActivation",
+    "SoftSignActivation"
 ]
 
 
@@ -243,8 +244,20 @@ class ReciprocalActivation(BaseActivation):
     Reciprocal Activation.
 
     .. math::
-       f(z) = 1/z
+       f(z)=\\frac{1}{z}
     """
 
     def __init__(self):
         BaseActivation.__init__(self, 'reciprocal', False)
+
+
+class SoftSignActivation(BaseActivation):
+    """
+    SoftSign Activation.
+
+    .. math::
+       f(z)=\\frac{z}{1 + |z|}
+    """
+
+    def __init__(self):
+        BaseActivation.__init__(self, 'softsign', False)
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index 57979db4de08989ab583b0ab41589c09789a0921..95797fba8f67bacb421f5c2813ad6332bc53cbc9 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -297,7 +297,7 @@ def auc_evaluator(
 def pnpair_evaluator(
         input,
         label,
-        info,
+        query_id,
         weight=None,
         name=None, ):
     """
@@ -308,16 +308,20 @@ def pnpair_evaluator(
 
     .. code-block:: python
 
-       eval = pnpair_evaluator(input, label, info)
+       eval = pnpair_evaluator(input, label, query_id)
 
     :param input: Input Layer name. The output prediction of network.
     :type input: LayerOutput
     :param label: Label layer name.
     :type label: LayerOutput
-    :param info: Info layer name. (TODO, explaination)
-    :type info: LayerOutput
+    :param query_id: Query_id layer name. Query_id indicates that which query
+     each sample belongs to. Its shape should be
+     the same as output of Label layer.
+    :type query_id: LayerOutput
     :param weight: Weight Layer name. It should be a matrix with size
-                  [sample_num, 1]. (TODO, explaination)
+                  [sample_num, 1] which indicates the weight of each sample.
+                  The default weight of sample is 1 if the weight layer is None.
+                  And the pair weight is the mean of the two samples' weight.
     :type weight: LayerOutput
     :param name: Evaluator name.
     :type name: None|basestring
@@ -326,8 +330,8 @@ def pnpair_evaluator(
         input = [input]
     if label:
         input.append(label)
-    if info:
-        input.append(info)
+    if query_id:
+        input.append(query_id)
     evaluator_base(
         input=input,
         type="pnpair",
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index a02eba007ddf929ff92df995df253f5a386bac7b..f6dc58b9c0ed0b14ad9db098892af14274aed0c1 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -51,6 +51,7 @@ __all__ = [
     'last_seq',
     'first_seq',
     'cos_sim',
+    'l2_distance_layer',
     'hsigmoid',
     'conv_projection',
     'square_error_cost',
@@ -115,6 +116,7 @@ __all__ = [
     'huber_classification_cost',
     'block_expand_layer',
     'maxout_layer',
+    'dot_prod_layer',
     'out_prod_layer',
     'printer_layer',
     'print_layer',
@@ -146,6 +148,7 @@ __all__ = [
     'resize_layer',
     'sub_seq_layer',
     'scale_sub_region_layer',
+    'factorization_machine',
 ]
 
 
@@ -167,6 +170,7 @@ class LayerType(object):
     COST = 'cost'
     COSINE_SIM_VEC = 'cos_vm'
     COSINE_SIM = 'cos'
+    L2_DISTANCE = 'l2_distance'
     HSIGMOID = 'hsigmoid'
     CONV_LAYER = 'conv'
     CONVTRANS_LAYER = 'convt'
@@ -197,6 +201,7 @@ class LayerType(object):
     SCALING_LAYER = 'scaling'
     TRANS_LAYER = 'trans'
     ROTATE_LAYER = 'rotate'
+    DOT_PROD_LAYER = 'dot_prod'
     OUT_PROD_LAYER = 'out_prod'
     FEATURE_MAP_EXPAND_LAYER = 'featmap_expand'
 
@@ -260,6 +265,8 @@ class LayerType(object):
 
     SCALE_SUB_REGION_LAYER = 'scale_sub_region'
 
+    FACTORIZATION_MACHINE = 'factorization_machine'
+
     @staticmethod
     def is_layer_type(type_name):
         """
@@ -1896,9 +1903,12 @@ def repeat_layer(input,
     A layer for repeating the input for num_repeats times.
 
     If as_row_vector:
+
     .. math::
        y  = [x_1,\cdots, x_n, \cdots, x_1, \cdots, x_n]
+
     If not as_row_vector:
+
     .. math::
        y  = [x_1,\cdots, x_1, \cdots, x_n, \cdots, x_n]
 
@@ -1911,19 +1921,19 @@ def repeat_layer(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param num_repeats: Repeat the input so many times
+    :param num_repeats: The times of repeating the input.
     :type num_repeats: int
     :param name: The name of this layer. It is optional.
-    :param as_row_vector: True for treating input as row vector and repeating
-                          in the column direction.  This is equivalent to apply
-                          concat_layer() with num_repeats same input.
-                          False for treating input as column vector and repeating
-                          in the row direction.
+    :type name: basestring
+    :param as_row_vector: Whether to treat the input as row vectors or not. If
+                          the parameter is set to True, the repeating operation
+                          will be performed in the column direction. Otherwise,
+                          it will be performed in the row direction.
     :type as_row_vector: bool
     :param act: Activation type. IdentityActivation is the default activation.
     :type act: BaseActivation
-    :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1970,13 +1980,14 @@ def seq_reshape_layer(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param reshape_size: the size of reshaped sequence.
+    :param reshape_size: The dimension of the reshaped sequence.
     :type reshape_size: int
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param act: Activation type. IdentityActivation is the default activation.
     :type act: BaseActivation
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :param bias_attr: The bias attribute. If the parameter is set to False or an object
                       whose type is not ParameterAttribute, no bias is defined. If the
@@ -2004,7 +2015,7 @@ def seq_reshape_layer(input,
 @layer_support()
 def interpolation_layer(input, weight, name=None, layer_attr=None):
     """
-    This layer is for linear interpolation with two inputs,
+    This layer performs linear interpolation on two inputs,
     which is used in NEURAL TURING MACHINE.
 
     .. math::
@@ -2026,7 +2037,8 @@ def interpolation_layer(input, weight, name=None, layer_attr=None):
     :type weight: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2060,7 +2072,7 @@ def bilinear_interp_layer(input,
                           name=None,
                           layer_attr=None):
     """
-    This layer is to implement bilinear interpolation on conv layer output.
+    This layer implements bilinear interpolation on convolutional layer's output.
 
     Please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation
 
@@ -2070,18 +2082,19 @@ def bilinear_interp_layer(input,
 
        bilinear = bilinear_interp_layer(input=layer1, out_size_x=64, out_size_y=64)
 
-    :param   input:        A input layer.
-    :type    input:        LayerOutput.
-    :param   out_size_x:   bilinear interpolation output width.
-    :type    out_size_x:   int | None
-    :param   out_size_y:   bilinear interpolation output height.
-    :type    out_size_y:   int | None
-    :param   name:         The layer's name, which cna not be specified.
-    :type    name:         None | basestring
-    :param   layer_attr:   Extra Layer attribute.
-    :type    layer_attr:   ExtraLayerAttribute
+    :param input: The input of this layer.
+    :type input: LayerOutput.
+    :param out_size_x: The width of the output.
+    :type out_size_x: int
+    :param out_size_y: The height of the output.
+    :type out_size_y: int
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
-    :rtype:  LayerOutput
+    :rtype: LayerOutput
     """
     assert input.layer_type == LayerType.CONV_LAYER
     assert isinstance(input.activation, LinearActivation)
@@ -2116,8 +2129,8 @@ def power_layer(input, weight, name=None, layer_attr=None):
     .. math::
        y = x^w
 
-    where :math:`x` is a input vector, :math:`w` is scalar weight,
-    and :math:`y` is a output vector.
+    where :math:`x` is an input vector, :math:`w` is a scalar exponent,
+    and :math:`y` is an output vector.
 
     The example usage is:
 
@@ -2127,11 +2140,12 @@ def power_layer(input, weight, name=None, layer_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param weight: Weight layer.
+    :param weight: The exponent of the power.
     :type weight: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2171,11 +2185,12 @@ def scaling_layer(input, weight, name=None, layer_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param weight: Weight layer.
+    :param weight: The weight of each sample.
     :type weight: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2213,7 +2228,8 @@ def trans_layer(input, name=None, layer_attr=None):
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2249,11 +2265,14 @@ def rotate_layer(input, height, width, name=None, layer_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param height: The height of the sample matrix
+    :param height: The height of the sample matrix.
     :type height: int
+    :param width: The width of the sample matrix.
+    :type width: int
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2298,15 +2317,15 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param a: input layer a
+    :param a: The first input of this layer.
     :type a: LayerOutput
-    :param b: input layer b
+    :param b: The second input of this layer.
     :type b: LayerOutput
-    :param scale: scale for cosine value. default is 5.
+    :param scale: The scale of the cosine similarity. 1 is the default value.
     :type scale: float
-    :param size: layer size. NOTE size_a * size should equal size_b.
+    :param size: The dimension of this layer. NOTE size_a * size should equal size_b.
     :type size: int
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2332,6 +2351,51 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
     return LayerOutput(name, LayerType.COSINE_SIM, parents=[a, b], size=size)
 
 
+@wrap_name_default()
+@layer_support()
+def l2_distance_layer(x, y, name=None, layer_attr=None):
+    """
+    This layer calculates and returns the Euclidean distance between two input
+    vectors x and y. The equation is as follows:
+
+    ..  math::
+        l2_distance(\\mathbf{x}, \\mathbf{y}) = \\sqrt{\\sum_{i=1}^D(x_i - y_i)}
+
+    The output size of this layer is fixed to be 1. Note that the above
+    computation is for one sample. Multiple samples are processed in one batch.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       l2_sim = l2_distance(x=layer1, y=layer2)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param x: The first input x for this layer, whose output is a matrix with
+              dimensionality N x D. N is the sample number in a mini-batch.
+              D is the dimensionality of x's output.
+    :type x: LayerOutput
+    :param y: The second input y for this layer, whose output is a matrix with
+              dimensionality N x D. N is the sample number in a mini-batch.
+              D is the dimensionality of y's output.
+    :type y: LayerOutput
+    :param layer_attr: The extra layer attributes, for example, drop rate.
+                       See ExtraLayerAttribute for more details.
+    :type layer_attr: ExtraLayerAttribute
+    :return: The returned LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(x, LayerOutput) and isinstance(y, LayerOutput)
+    Layer(
+        name=name,
+        type=LayerType.L2_DISTANCE,
+        inputs=[x.name, y.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(name, LayerType.L2_DISTANCE, parents=[x, y], size=1)
+
+
 @wrap_name_default()
 @wrap_bias_attr_default(has_bias=True)
 @wrap_param_attr_default()
@@ -2346,8 +2410,10 @@ def hsigmoid(input,
     """
     Organize the classes into a binary tree. At each node, a sigmoid function
     is used to calculate the probability of belonging to the right branch.
-    This idea is from "F. Morin, Y. Bengio (AISTATS 05):
-    Hierarchical Probabilistic Neural Network Language Model."
+
+    Reference:
+        `Hierarchical Probabilistic Neural Network Language Model
+        <http://www.gatsby.ucl.ac.uk/aistats/fullpapers/208.pdf>`_
 
     The example usage is:
 
@@ -2358,19 +2424,21 @@ def hsigmoid(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput | list | tuple
-    :param label: Label layer.
+    :param label: The input label.
     :type label: LayerOutput
-    :param num_classes: number of classes.
-    :type num_classes: int | None
+    :param num_classes: The number of classes. And it should be larger than 2. If the parameter
+                        is not set or set to None, its actual value will be automatically set to
+                        the number of labels.
+    :type num_classes: int
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param bias_attr: The bias attribute. If the parameter is set to False or an object
                       whose type is not ParameterAttribute, no bias is defined. If the
                       parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: Parameter Attribute. None means default parameter.
-    :type param_attr: ParameterAttribute | None
-    :param layer_attr: Extra Layer Attribute.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2458,12 +2526,12 @@ def img_conv_layer(input,
     input is raw pixels of image(mono or RGB), or it may be the previous layer's
     num_filters * num_group.
 
-    There are several group of filter in PaddlePaddle implementation.
-    Each group will process some channel of the inputs. For example, if an input
+    There are several groups of filters in PaddlePaddle implementation.
+    Each group will process some channels of the input. For example, if
     num_channel = 256, group = 4, num_filter=32, the PaddlePaddle will create
-    32*4 = 128 filters to process inputs. The channels will be split into 4
-    pieces. First 256/4 = 64 channels will process by first 32 filters. The
-    rest channels will be processed by rest group of filters.
+    32*4 = 128 filters to process the input. The channels will be split into 4
+    pieces. First 256/4 = 64 channels will be processed by first 32 filters. The
+    rest channels will be processed by the rest groups of filters.
 
     The example usage is:
 
@@ -2479,53 +2547,68 @@ def img_conv_layer(input,
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param filter_size: The x dimension of a filter kernel. Or input a tuple for
-                        two image dimension.
+    :param filter_size: The dimensions of the filter kernel. If the parameter is
+                        set to one integer, the two dimensions on x and y axises
+                        will be same when filter_size_y is not set. If it is set
+                        to a list, the first element indicates the dimension on
+                        the x axis, and the second is used to specify the dimension
+                        on the y axis when filter_size_y is not provided.
     :type filter_size: int | tuple | list
-    :param filter_size_y: The y dimension of a filter kernel. Since PaddlePaddle
-                        currently supports rectangular filters, the filter's
-                        shape will be (filter_size, filter_size_y).
-    :type filter_size_y: int | None
+    :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
+                          is not set, it will be set automatically according to filter_size.
+    :type filter_size_y: int
     :param num_filters: Each filter group's number of filter
     :param act: Activation type. ReluActivation is the default activation.
     :type act: BaseActivation
-    :param groups: Group size of filters.
+    :param groups: The group number. 1 is the default group number.
     :type groups: int
-    :param stride: The x dimension of the stride. Or input a tuple for two image
-                   dimension.
+    :param stride: The strides. If the parameter is set to one integer, the strides
+                   on x and y axises will be same when stride_y is not set. If it is
+                   set to a list, the first element indicates the stride on the x axis,
+                   and the second is used to specify the stride on the y axis when
+                   stride_y is not provided. 1 is the default value.
     :type stride: int | tuple | list
-    :param stride_y: The y dimension of the stride.
+    :param stride_y: The stride on the y axis.
     :type stride_y: int
-    :param padding: The x dimension of the padding. Or input a tuple for two
-                    image dimension
+    :param padding: The padding sizes. If the parameter is set to one integer, the padding
+                    sizes on x and y axises will be same when padding_y is not set. If it
+                    is set to a list, the first element indicates the padding size on the
+                    x axis, and the second is used to specify the padding size on the y axis
+                    when padding_y is not provided. 0 is the default padding size.
     :type padding: int | tuple | list
-    :param padding_y: The y dimension of the padding.
+    :param padding_y: The padding size on the y axis.
     :type padding_y: int
-    :param dilation: The x dimension of the dilation. Or input a tuple for two
-                    image dimension
+    :param dilation: The dimensions of the dilation. If the parameter is set to one integer,
+                     the two dimensions on x and y axises will be same when dilation_y is not
+                     set. If it is set to a list, the first element indicates the dimension
+                     on the x axis, and the second is used to specify the dimension on the y
+                     axis when dilation_y is not provided. 1 is the default dimension.
     :type dilation: int | tuple | list
-    :param dilation_y: The y dimension of the dilation.
+    :param dilation_y: The dimension of the dilation on the y axis.
     :type dilation_y: int
     :param bias_attr: The bias attribute. If the parameter is set to False or an object
                       whose type is not ParameterAttribute, no bias is defined. If the
                       parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param num_channels: number of input channels. If None will be set
-                        automatically from previous output.
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channel number of the input.
     :type num_channels: int
-    :param param_attr: Convolution param attribute. None means default attribute
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param shared_biases: Is biases will be shared between filters or not.
+    :param shared_biases: Whether biases will be shared between filters or not.
     :type shared_biases: bool
-    :param layer_attr: Layer Extra Attribute.
+    :param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :param trans: true if it is a convTransLayer, false if it is a convLayer
+    :param trans: True if it is a convTransLayer, False if it is a convLayer
     :type trans: bool
-    :param layer_type: specify the layer_type, default is None. If trans=True,
-                       layer_type has to be "exconvt" or "cudnn_convt",
-                       otherwise layer_type has to be either "exconv" or
-                       "cudnn_conv"
-    :type layer_type: String
+    :param layer_type: Specify the layer type. If the dilation's dimension on one axis is
+                       larger than 1, layer_type has to be "cudnn_conv" or "cudnn_convt".
+                       If trans=True, layer_type has to be "exconvt" or "cudnn_convt",
+                       otherwise layer_type has to be either "exconv" or "cudnn_conv".
+    :type layer_type: basestring
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -2630,7 +2713,7 @@ def img_pool_layer(input,
     """
     Image pooling Layer.
 
-    The details of pooling layer, please refer ufldl's pooling_ .
+    The details of pooling layer, please refer to ufldl's pooling_ .
 
     .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
 
@@ -2662,32 +2745,37 @@ def img_pool_layer(input,
                                  padding_y=2,
                                  pool_type=MaxPooling())
 
-    :param padding: pooling padding width.
+    :param padding: The padding size on the x axis. 0 is the default padding size.
     :type padding: int
-    :param padding_y: pooling padding height. It's equal to padding by default.
-    :type padding_y: int | None
-    :param name: name of pooling layer
-    :type name: basestring.
+    :param padding_y: The padding size on the y axis. If the parameter is not set
+                      or set to None, it will be set to 'padding' automatically.
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param pool_size: pooling window width
+    :param pool_size: The pooling window length on the x axis.
     :type pool_size: int
-    :param pool_size_y: pooling window height. It's eaqual to pool_size by default.
-    :type pool_size_y: int | None
-    :param num_channels: number of input channel.
+    :param pool_size_y: The pooling window length on the y axis. If the parameter is
+                        not set or set to None, its actual value will be automatically
+                        set to pool_size.
+    :type pool_size_y: int
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
     :type num_channels: int
-    :param pool_type: pooling type. MaxPooling or AvgPooling. Default is
-                      MaxPooling.
+    :param pool_type: Pooling type. MaxPooling is the default pooling.
     :type pool_type: BasePoolingType
-    :param stride: stride width of pooling.
+    :param stride: The stride on the x axis. 1 is the default value.
     :type stride: int
-    :param stride_y: stride height of pooling. It is equal to stride by default.
-    :type stride_y: int | None
-    :param layer_attr: Extra Layer attribute.
+    :param stride_y: The stride on the y axis. If the parameter is not set or set to
+                     None, its actual value will be automatically set to 'stride'.
+    :type stride_y: int
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :param ceil_mode: Wether to use ceil mode to calculate output height and with.
-                      Defalut is True. If set false, Otherwise use floor.
-
+    :param ceil_mode: Wether to use the ceil function to calculate output height and width.
+                      True is the default. If it is set to False, the floor function will
+                      be used.
     :type ceil_mode: bool
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2793,24 +2881,32 @@ def img_pool3d_layer(input,
 
     :param padding: pooling padding width.
     :type padding: int | tuple | list
-    :param name: name of pooling layer
+    :param name: The name of this layer. It is optional.
     :type name: basestring.
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param pool_size: pooling window width
+    :param pool_size: The pooling window lengths along three axises. If the parameter
+                      is set to one integer, the three lengths will be same.
     :type pool_size: int | tuple | list
-    :param num_channels: number of input channel.
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
     :type num_channels: int
-    :param pool_type: pooling type. MaxPooling or AvgPooling. Default is
-                      MaxPooling.
+    :param pool_type: Pooling type. MaxPooling is the default pooling.
     :type pool_type: BasePoolingType
-    :param stride: stride width of pooling.
+    :param stride: The strides of the pooling along three axises. If the parameter
+                   is set to one integer, the three strides will be same. 1 is the
+                   default value.
     :type stride: int | tuple | list
-    :param layer_attr: Extra Layer attribute.
+    :param padding: The sizes of padding along three axises. If the parameter is set to
+                    one integer, they will be same. 0 is the default padding size.
+    :type padding: int | tuple | list
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :param ceil_mode: Wether to use ceil mode to calculate output height and with.
-                      Defalut is True. If set false, Otherwise use floor.
-
+    :param ceil_mode: Wether to use the ceil function to calculate output height and width.
+                      True is the default. If it is set to False, the floor function will
+                      be used.
     :type ceil_mode: bool
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2889,9 +2985,11 @@ def spp_layer(input,
               pyramid_height=None,
               layer_attr=None):
     """
-    Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition.
-    The details please refer to
-    `Kaiming He's paper <https://arxiv.org/abs/1406.4729>`_.
+    A layer performs spatial pyramid pooling.
+
+    Reference:
+        `Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
+        https://arxiv.org/abs/1406.4729`_
 
     The example usage is:
 
@@ -2906,13 +3004,16 @@ def spp_layer(input,
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param num_channels: number of input channel.
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
     :type num_channels: int
-    :param pool_type: Pooling type. MaxPooling or AveragePooling. Default is MaxPooling.
+    :param pool_type: Pooling type. MaxPooling is the default pooling.
     :type scale: BasePoolingType
-    :param pyramid_height: pyramid height.
+    :param pyramid_height: The pyramid height of this pooling.
     :type pyramid_height: int
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2989,8 +3090,8 @@ def img_cmrnorm_layer(input,
     Response normalization across feature maps.
 
     Reference:
-        ImageNet Classification with Deep Convolutional Neural Networks
-        http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf
+        `ImageNet Classification with Deep Convolutional Neural Networks
+        http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf`_
 
     The example usage is:
 
@@ -3036,6 +3137,7 @@ def batch_norm_layer(input,
                      param_attr=None,
                      layer_attr=None,
                      batch_norm_type=None,
+                     epsilon=1e-5,
                      moving_average_fraction=0.9,
                      use_global_stats=None,
                      mean_var_names=None):
@@ -3055,9 +3157,9 @@ def batch_norm_layer(input,
         y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
 
     Reference:
-        Batch Normalization: Accelerating Deep Network Training by Reducing
+        `Batch Normalization: Accelerating Deep Network Training by Reducing
         Internal Covariate Shift
-        http://arxiv.org/abs/1502.03167
+        http://arxiv.org/abs/1502.03167`_
 
     The example usage is:
 
@@ -3106,6 +3208,8 @@ def batch_norm_layer(input,
                              will use the mean and variance of the current batch
                              of test data.
     :type use_global_stats: bool | None.
+    :param epsilon: The small constant added to the variance to improve numeric stability.
+    :type epsilon: float.
     :param moving_average_fraction: Factor used in the moving average computation.
                                    :math:`runningMean = newMean*(1-factor) + runningMean*factor`
     :type moving_average_fraction: float.
@@ -3123,6 +3227,7 @@ def batch_norm_layer(input,
     assert (batch_norm_type is None) or (batch_norm_type == "batch_norm") or \
            (batch_norm_type == "mkldnn_batch_norm") or \
            (batch_norm_type == "cudnn_batch_norm")
+
     l = Layer(
         name=name,
         img3D=img3D,
@@ -3132,6 +3237,7 @@ def batch_norm_layer(input,
         type=LayerType.BATCH_NORM_LAYER,
         batch_norm_type=batch_norm_type,
         bias=ParamAttr.to_bias(bias_attr),
+        epsilon=epsilon,
         moving_average_fraction=moving_average_fraction,
         use_global_stats=use_global_stats,
         mean_var_names=mean_var_names,
@@ -3871,7 +3977,7 @@ def recurrent_layer(input,
     :type input: LayerOutput
     :param act: Activation type. TanhActivation is the default activation.
     :type act: BaseActivation
-    :param bias_attr: The parameter attribute for bias. If this parameter is set to 
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
                       False or an object whose type is not ParameterAttribute,
                       no bias is defined. If the parameter is set to True,
                       the bias is initialized to zero.
@@ -4140,6 +4246,45 @@ def maxid_layer(input, name=None, layer_attr=None):
         size=l.config.size)
 
 
+@wrap_name_default()
+def dot_prod_layer(input1, input2, name=None, layer_attr=None):
+    """
+    A layer for computing the dot product of two vectors.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        dot_prod = dot_prod_layer(input1=vec1, input2=vec2)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input1: The first input layer.
+    :type input1: LayerOutput
+    :param input2: The second input layer.
+    :type input2: LayerOutput
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute.
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input1, LayerOutput)
+    assert isinstance(input2, LayerOutput)
+    assert input1.size == input2.size, ("Two inputs should have the same size.")
+
+    l = Layer(
+        name=name,
+        type=LayerType.DOT_PROD_LAYER,
+        inputs=[input1.name, input2.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.DOT_PROD_LAYER,
+        parents=[input1, input2],
+        size=l.config.size)
+
+
 @wrap_name_default()
 def out_prod_layer(input1, input2, name=None, layer_attr=None):
     """
@@ -4606,7 +4751,7 @@ def conv_projection(input,
                         will be same when filter_size_y is not set. If it is set
                         to a list, the first element indicates the dimension on
                         the x axis, and the second is used to specify the dimension
-                        on the y axis when filter_size is not provided.
+                        on the y axis when filter_size_y is not provided.
     :type filter_size: int | tuple | list
     :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
                           is not set, it will be set automatically according to filter_size.
@@ -5271,10 +5416,10 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
     to be devided by groups.
 
     Reference:
-        Maxout Networks
-        http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
-        Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
-        https://arxiv.org/pdf/1312.6082v4.pdf
+        `Maxout Networks
+        http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf`_
+        `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
+        https://arxiv.org/pdf/1312.6082v4.pdf`_
 
     .. math::
        y_{si+j} = \max_k x_{gsi + sk + j}
@@ -5339,9 +5484,9 @@ def ctc_layer(input,
     alignment between the inputs and the target labels is unknown.
 
     Reference:
-        Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
+        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
         with Recurrent Neural Networks
-        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
+        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_
 
     Note:
         Considering the 'blank' label needed by CTC, you need to use (num_classes + 1)
@@ -5413,9 +5558,9 @@ def warp_ctc_layer(input,
     install it to :code:`third_party/install/warpctc` directory.
 
     Reference:
-        Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
+        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
         with Recurrent Neural Networks
-        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
+        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_
 
     Note:
         - Let num_classes represents the category number. Considering the 'blank'
@@ -5635,8 +5780,8 @@ def nce_layer(input,
     Noise-contrastive estimation.
 
     Reference:
-        A fast and simple algorithm for training neural probabilistic language
-        models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf
+        `A fast and simple algorithm for training neural probabilistic language
+        models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf`_
 
     The example usage is:
 
@@ -5751,8 +5896,8 @@ def rank_cost(left,
     A cost Layer for learning to rank using gradient descent.
 
     Reference:
-        Learning to Rank using Gradient Descent
-        http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf
+        `Learning to Rank using Gradient Descent
+        http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf`_
 
     .. math::
 
@@ -6287,8 +6432,8 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
         smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if}  \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases}
 
     Reference:
-        Fast R-CNN
-        https://arxiv.org/pdf/1504.08083v2.pdf
+        `Fast R-CNN
+        https://arxiv.org/pdf/1504.08083v2.pdf`_
 
     The example usage is:
 
@@ -6483,18 +6628,19 @@ def row_conv_layer(input,
 
 @layer_support()
 @wrap_name_default()
-@wrap_param_attr_default()
 def prelu_layer(input,
                 name=None,
                 partial_sum=1,
+                channel_shared=None,
+                num_channels=None,
                 param_attr=None,
                 layer_attr=None):
     """
     The Parametric Relu activation that actives outputs with a learnable weight.
 
     Reference:
-        Delving Deep into Rectifiers: Surpassing Human-Level Performance on
-        ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf
+        `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+        ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf`_
 
     .. math::
        z_i &\\quad if \\quad z_i > 0 \\\\
@@ -6517,6 +6663,14 @@ def prelu_layer(input,
         - partial_sum = number of outputs, indicates all elements share the same weight.
 
     :type partial_sum: int
+    :param channel_shared: whether or not the parameter are shared across channels.
+
+        - channel_shared = True, we set the partial_sum to the number of outputs.
+        - channel_shared = False, we set the partial_sum to the number of elements in one channel.
+
+    :type channel_shared: bool
+    :param num_channels: number of input channel.
+    :type num_channels: int
     :param param_attr: The parameter attribute. See ParameterAttribute for details.
     :type param_attr: ParameterAttribute
     :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
@@ -6527,7 +6681,25 @@ def prelu_layer(input,
     """
 
     assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.'
-    assert isinstance(param_attr, ParameterAttribute)
+
+    if not param_attr:
+        param_attr = ParamAttr(initial_mean=0.25, initial_std=0.0)
+    else:
+        assert isinstance(param_attr, ParameterAttribute)
+
+    if num_channels is None:
+        assert input.num_filters is not None, \
+                'the input channel cannot be detected, please specify the num_channels parameter'
+        num_channels = input.num_filters
+
+    if channel_shared is not None:
+        assert isinstance(channel_shared, bool)
+        assert (input.height != 0 and input.width != 0), \
+            'input height and widht must be setted'
+        if channel_shared:
+            partial_sum = input.height * input.width * num_channels
+        else:
+            partial_sum = input.height * input.width
 
     l = Layer(
         name=name,
@@ -6539,6 +6711,7 @@ def prelu_layer(input,
         name=name,
         layer_type=LayerType.PRELU,
         parents=input,
+        num_filters=num_channels,
         size=l.config.size)
 
 
@@ -6563,8 +6736,8 @@ def gated_unit_layer(input,
     product between :match:`X'` and :math:`\sigma` is finally returned.
 
     Reference:
-        Language Modeling with Gated Convolutional Networks
-        https://arxiv.org/abs/1612.08083
+        `Language Modeling with Gated Convolutional Networks
+        https://arxiv.org/abs/1612.08083`_
 
     .. math::
        y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)
@@ -6700,6 +6873,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
 
     :param input: The input of this layer. If two inputs are given, the second one
                   will be regarded as the reference.
+                  And the input must be 4-dims and in NCHW order.
     :type input: LayerOutput | Sequence
     :param offset: The crop offset.
     :type offset: Sequence
@@ -6988,7 +7162,7 @@ def img_conv3d_layer(input,
     :type layer_attr: ExtraLayerAttribute
     :param trans: True if it is a convTransLayer, False if it is a convLayer
     :type trans: bool
-    :param layer_type: Specify the layer_type. If the parameter is set, it must be "deconv3d"
+    :param layer_type: Specify the layer type. If the parameter is set, it must be "deconv3d"
                        when trans=True. If not set, it will be automatically set to "deconv3d"
                        when trans=True and "conv3d" when trans=False.
     :type layer_type: basestring
@@ -7233,3 +7407,73 @@ def scale_sub_region_layer(input, indices, value, name=None):
         parents=[input, indices],
         num_filters=input.num_filters,
         size=input.size)
+
+
+@wrap_name_default()
+@wrap_act_default(act=LinearActivation())
+@wrap_param_attr_default()
+@layer_support()
+def factorization_machine(input,
+                          factor_size,
+                          act=None,
+                          name=None,
+                          param_attr=None,
+                          layer_attr=None):
+    """
+    The Factorization Machine models pairwise feature interactions as inner
+    product of the learned latent vectors corresponding to each input feature.
+    The Factorization Machine can effectively capture feature interactions
+    especially when the input is sparse.
+
+    This implementation only consider the 2-order feature interactions using
+    Factorization Machine with the formula:
+
+    .. math::
+        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+
+    Note:
+        X is the input vector with size n. V is the factor matrix. Each row of V
+        is the latent vector corresponding to each input dimesion. The size of
+        each latent vector is k.
+
+    For details of Factorization Machine, please refer to the paper:
+    Factorization machines.
+
+    .. code-block:: python
+        first_order = paddle.layer.fc(input=input,
+                                      size=1,
+                                      act=paddle.activation.Linear())
+        second_order = paddle.layer.factorization_machine(input=input,
+                                                          factor_size=10)
+        fm = paddle.layer.addto(input=[first_order, second_order],
+                                act=paddle.activation.Linear(),
+                                bias_attr=False)
+
+    :param input: The input layer. Supported input types: all input data types
+                  on CPU, and only dense input types on GPU.
+    :type input: LayerOutput
+    :param factor_size: The hyperparameter that defines the dimensionality of
+                        the latent vector size.
+    :type context_len: int
+    :param act: Activation Type. Default is linear activation.
+    :type act: BaseActivation
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert factor_size > 0, "the factor_size must be greater than 0."
+
+    Layer(
+        inputs=[Input(input.name, **param_attr.attr)],
+        name=name,
+        factor_size=factor_size,
+        type=LayerType.FACTORIZATION_MACHINE,
+        active_type=act.name,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.FACTORIZATION_MACHINE, input, activation=act, size=1)
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index d323d34c3ff47614342934c2a02492f66d27dc10..9776ae18057d57dd994fac8b62090258252922c6 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import math
 
 from activations import LinearActivation, ReluActivation, SoftmaxActivation, \
     IdentityActivation, TanhActivation, SequenceSoftmaxActivation
@@ -26,9 +26,9 @@ __all__ = [
     'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
     "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
     'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru',
-    'simple_attention', 'dot_product_attention', 'simple_gru2',
-    'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm', 'inputs',
-    'outputs'
+    'simple_attention', 'dot_product_attention', 'multi_head_attention',
+    'simple_gru2', 'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm',
+    'inputs', 'outputs'
 ]
 
 ######################################################
@@ -1476,10 +1476,8 @@ def dot_product_attention(encoded_sequence,
         expand_as=encoded_sequence,
         name='%s_expand' % name)
 
-    m = linear_comb_layer(
-        weights=expanded,
-        vectors=encoded_sequence,
-        name='%s_dot-product' % name)
+    m = dot_prod_layer(
+        input1=expanded, input2=encoded_sequence, name='%s_dot-product' % name)
 
     attention_weight = fc_layer(
         input=m,
@@ -1498,6 +1496,134 @@ def dot_product_attention(encoded_sequence,
         input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
 
 
+@wrap_name_default()
+def multi_head_attention(query,
+                         key,
+                         value,
+                         key_proj_size,
+                         value_proj_size,
+                         head_num,
+                         attention_type,
+                         softmax_param_attr=None,
+                         name=None):
+    """
+    Calculate and return a context vector with dot-product attention mechanism.
+    The dimension of the context vector equals to value_proj_size * head_num.
+
+    Please refer to **Attention Is All You Need** for more details. The link is
+    as follows:
+    https://arxiv.org/abs/1706.03762.
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        context = multi_head_attention(query=decoder_state,
+                                       key=enc_seq,
+                                       value=enc_seq,
+                                       key_proj_size=64,
+                                       value_pro_size=64,
+                                       head_num=8,
+                                       attention_type='dot-product attention')
+
+    :param name: A prefix attached to the name of each layer that defined inside
+                 the multi_head_attention.
+    :type name: basestring
+    :param softmax_param_attr: The parameter attribute of sequence softmax
+                               that is used to produce attention weight.
+    :type softmax_param_attr: ParameterAttribute
+    :param query: query is used to calculate attention weights over values at current step.
+    :type query: LayerOutput
+    :param key: key is used to calculate the attention weight of the corresponding value.
+    :type key: LayerOutput
+    :param value: value is the sequence to be attended.
+    :type value: LayerOutput
+    :param key_proj_size: The dimension of the linear projection performed on key and query.
+    :type key_proj_size: int
+    :param value_proj_size: The dimension of the linear projection performed on value.
+    :type value_proj_size: int
+    :param head_num: The number of attention heads.
+    :type head_num: int
+    :param attention_type: The type of the attention mechanism used in each attention
+                           heads. Now, we only support scaled dot-product attention and
+                           additive attention.
+    :type attention_type: basestring
+    :return: The context vector.
+    :rtype: LayerOutput
+    """
+    assert attention_type in ['dot-product attention', 'additive attention']
+
+    with mixed_layer(
+            size=key_proj_size * head_num,
+            name='%s_query_proj' % name) as query_proj:
+        query_proj += full_matrix_projection(query)
+    query_proj = expand_layer(input=query_proj, expand_as=key)
+
+    with mixed_layer(
+            size=key_proj_size * head_num,
+            name='%s_key_proj' % name) as key_proj:
+        key_proj += full_matrix_projection(key)
+
+    with mixed_layer(
+            size=value_proj_size * head_num,
+            name='%s_value_proj' % name) as value_proj:
+        value_proj += full_matrix_projection(value)
+
+    head_list = []
+    for i in range(head_num):
+        with mixed_layer(size=key_proj_size) as sub_query_proj:
+            sub_query_proj += identity_projection(
+                query_proj, offset=key_proj_size * i, size=key_proj_size)
+
+        with mixed_layer(size=key_proj_size) as sub_key_proj:
+            sub_key_proj += identity_projection(
+                key_proj, offset=key_proj_size * i, size=key_proj_size)
+
+        with mixed_layer(size=value_proj_size) as sub_value_proj:
+            sub_value_proj += identity_projection(
+                value_proj, offset=value_proj_size * i, size=value_proj_size)
+
+        if attention_type == 'dot-product attention':
+            m = dot_prod_layer(
+                input1=sub_query_proj,
+                input2=sub_key_proj,
+                name='%s_dot-product_%d' % (name, i))
+            m = slope_intercept_layer(
+                input=m,
+                slope=math.sqrt(1.0 / key_proj_size),
+                name='%s_dot-product_scaling_%d' % (name, i))
+        else:
+            with mixed_layer(
+                    size=key_proj_size,
+                    act=TanhActivation(),
+                    name='%s_combine_%d' % (name, i)) as m:
+                m += identity_projection(sub_query_proj)
+                m += identity_projection(sub_key_proj)
+
+        attention_weight = fc_layer(
+            input=m,
+            size=1,
+            act=SequenceSoftmaxActivation(),
+            param_attr=softmax_param_attr,
+            name="%s_softmax_%d" % (name, i),
+            bias_attr=False)
+
+        scaled = scaling_layer(
+            weight=attention_weight,
+            input=sub_value_proj,
+            name='%s_scaling_%d' % (name, i))
+        head = pooling_layer(
+            input=scaled,
+            pooling_type=SumPooling(),
+            name="%s_pooling_%d" % (name, i))
+
+        head_list.append(head)
+
+    attended = concat_layer(head_list)
+
+    return attended
+
+
 def inputs(layers, *args):
     """
     Declare the inputs of network. The order of input should be as same as
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 1c7451e0abf5dc1b99671f292e2ffc2d2282abe9..10c941f707498ec45e79bed9d3f8054eea19887d 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -10,6 +10,8 @@ test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_la
 test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
 test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
 test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer
-test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer test_scale_sub_region_layer)
+test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer
+test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer
+test_factorization_machine)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
index b14121e82cb7d9516c4771fc896b9b3b9e01d1c8..3e0f957648879d4350d662b336c953273bac1378 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
@@ -65,6 +65,7 @@ layers {
   height: 227
   width: 227
   depth: 1
+  epsilon: 1e-05
 }
 layers {
   name: "__crmnorm_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
index c7a487a11231cba6182b654108773037bdb0ec35..a18a4652e14c0cfc4dbca87e67d31aa663ee756b 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
@@ -65,6 +65,7 @@ layers {
   height: 256
   width: 256
   depth: 1
+  epsilon: 1e-05
 }
 layers {
   name: "__crmnorm_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
index 832ed24a31dd2bedba9a4fce77d7a088d1796fdb..9b69ae4a3b3cbcc7c0c69a2d5b3728e2f0204f33 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
@@ -36,6 +36,7 @@ layers {
   height: 6
   width: 20
   depth: 3
+  epsilon: 1e-05
 }
 parameters {
   name: "___batch_norm_0__.w0"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..f1530c382c3d81a82592af2c43c06eb4278e2b4a
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
@@ -0,0 +1,38 @@
+type: "nn"
+layers {
+  name: "vector1"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "vector2"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "__dot_prod_layer_0__"
+  type: "dot_prod"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "vector1"
+  }
+  inputs {
+    input_layer_name: "vector2"
+  }
+}
+input_layer_names: "vector1"
+input_layer_names: "vector2"
+output_layer_names: "__dot_prod_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "vector1"
+  layer_names: "vector2"
+  layer_names: "__dot_prod_layer_0__"
+  input_layer_names: "vector1"
+  input_layer_names: "vector2"
+  output_layer_names: "__dot_prod_layer_0__"
+  is_recurrent_layer_group: false
+}
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..4f3002b19942ed58970bfd64e5978c1601273992
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
@@ -0,0 +1,39 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 1024
+  active_type: ""
+}
+layers {
+  name: "__factorization_machine_0__"
+  type: "factorization_machine"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___factorization_machine_0__.w0"
+  }
+  factor_size: 10
+}
+parameters {
+  name: "___factorization_machine_0__.w0"
+  size: 10240
+  initial_mean: 0.0
+  initial_std: 0.03125
+  dims: 1024
+  dims: 10
+  initial_strategy: 0
+  initial_smart: true
+}
+input_layer_names: "data"
+output_layer_names: "__factorization_machine_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__factorization_machine_0__"
+  input_layer_names: "data"
+  output_layer_names: "__factorization_machine_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..9ba33689edc893c2169a73679a04a6f51cfc83a8
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr
@@ -0,0 +1,39 @@
+type: "nn"
+layers {
+  name: "x"
+  type: "data"
+  size: 128
+  active_type: ""
+}
+layers {
+  name: "y"
+  type: "data"
+  size: 128
+  active_type: ""
+}
+layers {
+  name: "__l2_distance_layer_0__"
+  type: "l2_distance"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "x"
+  }
+  inputs {
+    input_layer_name: "y"
+  }
+}
+input_layer_names: "x"
+input_layer_names: "y"
+output_layer_names: "__l2_distance_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "x"
+  layer_names: "y"
+  layer_names: "__l2_distance_layer_0__"
+  input_layer_names: "x"
+  input_layer_names: "y"
+  output_layer_names: "__l2_distance_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
index 94ad56cab063df9e6a11bb1c293727fb9dec810f..63fb38c6508675d379f577b965ea17ad4c3b4942 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
@@ -4,6 +4,8 @@ layers {
   type: "data"
   size: 300
   active_type: ""
+  height: 10
+  width: 10
 }
 layers {
   name: "__prelu_layer_0__"
@@ -15,6 +17,9 @@ layers {
     input_parameter_name: "___prelu_layer_0__.w0"
   }
   partial_sum: 1
+  height: 10
+  width: 10
+  depth: 1
 }
 layers {
   name: "__prelu_layer_1__"
@@ -26,6 +31,9 @@ layers {
     input_parameter_name: "___prelu_layer_1__.w0"
   }
   partial_sum: 1
+  height: 10
+  width: 10
+  depth: 1
 }
 layers {
   name: "__prelu_layer_2__"
@@ -37,41 +45,100 @@ layers {
     input_parameter_name: "___prelu_layer_2__.w0"
   }
   partial_sum: 5
+  height: 10
+  width: 10
+  depth: 1
+}
+layers {
+  name: "__prelu_layer_3__"
+  type: "prelu"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___prelu_layer_3__.w0"
+  }
+  partial_sum: 300
+  height: 10
+  width: 10
+  depth: 1
+}
+layers {
+  name: "__prelu_layer_4__"
+  type: "prelu"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___prelu_layer_4__.w0"
+  }
+  partial_sum: 100
+  height: 10
+  width: 10
+  depth: 1
 }
 parameters {
   name: "___prelu_layer_0__.w0"
   size: 300
-  initial_mean: 0.0
-  initial_std: 0.057735026919
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 300
   initial_strategy: 0
-  initial_smart: true
+  initial_smart: false
 }
 parameters {
   name: "___prelu_layer_1__.w0"
   size: 300
-  initial_mean: 0.0
-  initial_std: 0.057735026919
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 300
   initial_strategy: 0
-  initial_smart: true
+  initial_smart: false
 }
 parameters {
   name: "___prelu_layer_2__.w0"
   size: 60
-  initial_mean: 0.0
-  initial_std: 0.129099444874
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 60
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___prelu_layer_3__.w0"
+  size: 1
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___prelu_layer_4__.w0"
+  size: 3
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 3
   initial_strategy: 0
-  initial_smart: true
+  initial_smart: false
 }
 input_layer_names: "input"
-output_layer_names: "__prelu_layer_2__"
+output_layer_names: "__prelu_layer_4__"
 sub_models {
   name: "root"
   layer_names: "input"
   layer_names: "__prelu_layer_0__"
   layer_names: "__prelu_layer_1__"
   layer_names: "__prelu_layer_2__"
+  layer_names: "__prelu_layer_3__"
+  layer_names: "__prelu_layer_4__"
   input_layer_names: "input"
-  output_layer_names: "__prelu_layer_2__"
+  output_layer_names: "__prelu_layer_4__"
   is_recurrent_layer_group: false
 }
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e52d48dde0084aacd3f7874cc384d59287a0c7d5
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
@@ -0,0 +1,7 @@
+from paddle.trainer_config_helpers import *
+
+vec1 = data_layer(name='vector1', size=10)
+vec2 = data_layer(name='vector2', size=10)
+dot_product = dot_prod_layer(input1=vec1, input2=vec2)
+
+outputs(dot_product)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
new file mode 100644
index 0000000000000000000000000000000000000000..b249de0fee3c8ca4ad0520872fa2497c493d31b5
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
@@ -0,0 +1,7 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='data', size=1024)
+
+fm = factorization_machine(input=data, factor_size=10)
+
+outputs(fm)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b36a5c6d1222860ee4b77f89ad4b6148ccd89589
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
@@ -0,0 +1,7 @@
+from paddle.trainer_config_helpers import *
+
+outputs(
+    l2_distance_layer(
+        x=data_layer(
+            name='x', size=128), y=data_layer(
+                name='y', size=128)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
index aae90fab32db78a70c2169ed8fafb930433f4136..45b02fbf325bb63b057bbbf64d59af8debf0bc9d 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
@@ -1,8 +1,10 @@
 from paddle.trainer_config_helpers import *
 
-data = data_layer(name='input', size=300)
-prelu = prelu_layer(input=data)
-prelu = prelu_layer(input=data, partial_sum=1)
-prelu = prelu_layer(input=data, partial_sum=5)
+data = data_layer(name='input', size=300, height=10, width=10)
+prelu = prelu_layer(input=data, num_channels=3)
+prelu = prelu_layer(input=data, partial_sum=1, num_channels=3)
+prelu = prelu_layer(input=data, partial_sum=5, num_channels=3)
+prelu = prelu_layer(input=data, channel_shared=True, num_channels=3)
+prelu = prelu_layer(input=data, channel_shared=False, num_channels=3)
 
 outputs(prelu)
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index 7bbe3eaaa67a117bc53571e6571365c3a26814c1..70f61e84997efdbe3d6f268d249be8bac15b9ecd 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -62,21 +62,15 @@ __all__ = [
 cp.begin_parse()
 
 
-def init(**kwargs):
-    import py_paddle.swig_paddle as api
-    args = []
-    args_dict = {}
-    # NOTE: append arguments if they are in ENV
-    for ek, ev in os.environ.iteritems():
-        if ek.startswith("PADDLE_INIT_"):
-            args_dict[ek.replace("PADDLE_INIT_", "").lower()] = str(ev)
+def set_omp_mkl_env_vars(trainer_count):
+    '''Auto set CPU environment if have not set before.
+       export KMP_AFFINITY, OMP_DYNAMIC according to the Hyper Threading status.
+       export OMP_NUM_THREADS, MKL_NUM_THREADS according to trainer_count.
+    '''
+    import platform
+    if not platform.system() in ['Linux', 'Darwin']:
+        return
 
-    args_dict.update(kwargs)
-    # NOTE: overwrite arguments from ENV if it is in kwargs
-    for key in args_dict.keys():
-        args.append('--%s=%s' % (key, str(args_dict[key])))
-
-    # auto set cpu environment
     def set_env(key, value):
         '''If the key has not been set in the environment, set it with value.'''
         assert isinstance(key, str)
@@ -85,22 +79,58 @@ def init(**kwargs):
         if envset is None:
             os.environ[key] = value
 
-    ht = os.popen("lscpu |grep \"per core\"|awk -F':' '{print $2}'|xargs")
-    ht = int(ht.read())
-    if ht == 1:  # ht is off
-        set_env("OMP_DYNAMIC", "false")
-        set_env("KMP_AFFINITY", "granularity=fine,compact,0,0")
-    else:
+    def num_physical_cores():
+        '''Get the number of physical cores'''
+        if platform.system() == "Linux":
+            num_sockets = int(
+                os.popen("grep 'physical id' /proc/cpuinfo | sort -u | wc -l")
+                .read())
+            num_cores_per_socket = int(
+                os.popen("grep 'core id' /proc/cpuinfo | sort -u | wc -l")
+                .read())
+            return num_sockets * num_cores_per_socket
+        else:
+            cmds = {"Darwin": "sysctl -n hw.physicalcpu"}
+            return int(os.popen(cmds.get(platform.system(), "expr 1")).read())
+
+    def num_logical_processors():
+        '''Get the number of logical processors'''
+        cmds = {
+            "Linux": "grep \"processor\" /proc/cpuinfo|sort -u|wc -l",
+            "Darwin": "sysctl -n hw.logicalcpu"
+        }
+        return int(os.popen(cmds.get(platform.system(), "expr 1")).read())
+
+    num_cores = num_physical_cores()
+    num_processors = num_logical_processors()
+    if num_processors > num_cores:  # Hyper Threading is enabled
         set_env("OMP_DYNAMIC", "true")
         set_env("KMP_AFFINITY", "granularity=fine,compact,1,0")
-    processors = os.popen("grep \"processor\" /proc/cpuinfo|sort -u|wc -l")
-    processors = int(processors.read())
-    trainers = kwargs.get('trainer_count', 1)
-    threads = processors / trainers
+    else:
+        set_env("OMP_DYNAMIC", "false")
+        set_env("KMP_AFFINITY", "granularity=fine,compact,0,0")
+    threads = num_processors / trainer_count
     threads = '1' if threads < 1 else str(threads)
     set_env("OMP_NUM_THREADS", threads)
     set_env("MKL_NUM_THREADS", threads)
 
+
+def init(**kwargs):
+    import py_paddle.swig_paddle as api
+    args = []
+    args_dict = {}
+    # NOTE: append arguments if they are in ENV
+    for ek, ev in os.environ.iteritems():
+        if ek.startswith("PADDLE_INIT_"):
+            args_dict[ek.replace("PADDLE_INIT_", "").lower()] = str(ev)
+
+    args_dict.update(kwargs)
+    # NOTE: overwrite arguments from ENV if it is in kwargs
+    for key in args_dict.keys():
+        args.append('--%s=%s' % (key, str(args_dict[key])))
+
+    set_omp_mkl_env_vars(kwargs.get('trainer_count', 1))
+
     if 'use_gpu' in kwargs:
         cp.g_command_config_args['use_gpu'] = kwargs['use_gpu']
     if 'use_mkldnn' in kwargs:
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
index 98b97c75ca72f11c105535e0f2a5fa0201db5d42..f10bf7e42a1ead09b3eba0d61e55701215e4360f 100644
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -38,6 +38,7 @@ UCI_TEST_DATA = None
 URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar'
 MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b'
 
+
 def feature_range(maximums, minimums):
     import matplotlib
     matplotlib.use('Agg')
@@ -114,7 +115,8 @@ def test():
 
 
 def model():
-    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar', MD5_MODEL)
+    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar',
+                                                 MD5_MODEL)
     with open(tar_file, 'r') as f:
         parameters = Parameters.from_tar(f)
     return parameters
diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py
index 5df612bf3530c843c16b337f2b8f83445fcf39b5..c033b27beab52a979c78caeba68990c95b462c56 100644
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -1,11 +1,42 @@
-import sys
-import core
-__all__ = ['proto']
-argv = []
-if core.is_compile_gpu():
-    argv = list(sys.argv) + [
-        "--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"
-    ]
-else:
-    argv = list(sys.argv) + ["--tryfromenv=use_pinned_memory"]
-core.init_gflags(argv)
+# import all class inside framework into fluid module
+import framework
+from framework import *
+# import all class inside executor into fluid module
+import executor
+from executor import *
+
+import io
+import evaluator
+import initializer
+import layers
+import nets
+import optimizer
+import backward
+import regularizer
+from param_attr import ParamAttr
+
+from core import LoDTensor, CPUPlace, GPUPlace
+
+Tensor = LoDTensor
+__all__ = framework.__all__ + executor.__all__ + [
+    'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
+    'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr'
+]
+
+
+def __read_gflags_from_env__():
+    """
+    Enable reading gflags from environment variables.
+    
+    Returns:
+        None
+    """
+    import sys
+    import core
+    read_env_flags = ['use_pinned_memory']
+    if core.is_compile_gpu():
+        read_env_flags.append('fraction_of_gpu_memory_to_use')
+    core.init_gflags(sys.argv + ["--tryfromenv=" + ",".join(read_env_flags)])
+
+
+__read_gflags_from_env__()
diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py
index 3a8f1831cf2c44c81aee62c6ee172942db188217..137c5736226b689340748d5098ca51659d5acff8 100644
--- a/python/paddle/v2/fluid/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
@@ -1,14 +1,18 @@
 import numpy as np
-from paddle.v2.fluid.framework import Program, g_main_program, unique_name, Variable
-import paddle.v2.fluid.core as core
 
+import layers
+from framework import Program, unique_name, Variable
+from layer_helper import LayerHelper
 
-def _clone_var_in_block_(block, var):
+__all__ = ['Accuracy']
+
+
+def _clone_var_(block, var):
     assert isinstance(var, Variable)
     return block.create_var(
         name=var.name,
         shape=var.shape,
-        dtype=var.data_type,
+        dtype=var.dtype,
         type=var.type,
         lod_level=var.lod_level,
         persistable=True)
@@ -16,172 +20,115 @@ def _clone_var_in_block_(block, var):
 
 class Evaluator(object):
     """
-    Evalutor Base class.
-
-    create metric states
-    add mini-batch evaluator caculate operator
-    add increment operator to accumulate the metric states
+    Base Class for all evaluators
+    
+    Args:
+        name(str): The name of evaluator. such as, "accuracy". Used for generate 
+            temporary variable name.
+        main_program(Program, optional): The evaluator should be added to this 
+            main_program. Default default_main_program()
+        startup_program(Program, optional):The parameter should be added to this 
+            startup_program. Default default_startup_program()
+            
+    Attributes:
+        states(list): The list of state variables. states will be reset to zero 
+            when `reset` is invoked.
+        metrics(list): The list of metrics variables. They will be calculate 
+            every mini-batch
     """
 
     def __init__(self, name, **kwargs):
-        """
-        init the global states
-        """
-        self._states = {}
-        if kwargs.has_key("main_program"):
-            self._main_program = kwargs.get("main_program")
-        else:
-            self._main_program = g_main_program
-
-    def _update_ops(self, *args, **kwargs):
-        """
-        append update ops to the global states
-        """
-        raise NotImplementedError()
+        self.states = []
+        self.metrics = []
+        self.helper = LayerHelper(name, **kwargs)
 
     def reset(self, executor, reset_program=None):
         """
-        Clear metric states at the begin of each pass/user specified batch
+        reset metric states at the begin of each pass/user specified batch
         """
-        if reset_program == None:
+        if reset_program is None:
             reset_program = Program()
-        else:
-            reset_program = program
-        block = reset_program.global_block()
-        for k, var in self._states.iteritems():
-            g_var = _clone_var_in_block_(block, var)
-            zeros = block.create_var(dtype="float32", persistable=True)
-            block.append_op(
-                type="fill_constant",
-                outputs={"Out": [zeros]},
-                attrs={
-                    "shape": g_var.shape,
-                    "value": .0,
-                    "data_type": 5,
-                })
-            block.append_op(
-                type="scale", inputs={"X": zeros}, outputs={"Out": g_var})
-        executor.run(reset_program, fetch_list=self._states.values())
+
+        for var in self.states:
+            assert isinstance(var, Variable)
+            g_var = _clone_var_(reset_program.current_block(), var)
+            layers.fill_constant(
+                shape=g_var.shape,
+                value=0.0,
+                dtype=g_var.dtype,
+                out=g_var,
+                main_program=reset_program)
+
+        executor.run(reset_program)
 
     def eval(self, executor, eval_program=None):
         """
-        Merge the mini-batch statistics to form the evaluation result for multiple mini-batches.
+        Evaluate the statistics merged by multiple mini-batches.
         """
         raise NotImplementedError()
 
+    def create_state(self, suffix, dtype, shape):
+        """
+        Create state variable. 
+        
+        NOTE: It is not a public API.
+        
+        Args:
+            suffix(str): the state suffix. 
+            dtype(str|core.DataType): the state data type 
+            shape(tuple|list): the shape of state 
+
+        Returns: State variable
+
+        """
+        state = self.helper.create_variable(
+            name="_".join([unique_name(self.helper.name), suffix]),
+            persistable=True,
+            dtype=dtype,
+            shape=shape)
+        self.states.append(state)
+        return state
+
 
 class Accuracy(Evaluator):
     """
-    Accuracy need two state variable Total, Correct
+    Average Accuracy for multiple mini-batches.
     """
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, input, label, k=1, **kwargs):
         super(Accuracy, self).__init__("accuracy", **kwargs)
-        block = self._main_program.global_block()
-        g_total = block.create_var(
-            name=unique_name("Total"),
-            persistable=True,
-            dtype="int64",
-            shape=[1])
-        g_correct = block.create_var(
-            name=unique_name("Correct"),
-            persistable=True,
-            dtype="int64",
-            shape=[1])
-        self._states["Total"] = g_total
-        self._states["Correct"] = g_correct
-
-    def _update_ops(self, input, label, k=1, **kwargs):
-        block = self._main_program.global_block()
-        topk_out = block.create_var(dtype=input.data_type)
-        topk_indices = block.create_var(dtype="int64")
-        block.append_op(
-            type="top_k",
-            inputs={"X": [input]},
-            outputs={"Out": [topk_out],
-                     "Indices": [topk_indices]},
-            attrs={"k": k})
-        acc_out = block.create_var(dtype=kwargs.get("out_dtype", "float32"))
-        correct = block.create_var(dtype="int64", persistable=True)
-        total = block.create_var(dtype="int64", persistable=True)
-        block.append_op(
-            type="accuracy",
-            inputs={
-                "Out": [topk_out],
-                "Indices": [topk_indices],
-                "Label": [label]
-            },
-            outputs={
-                "Accuracy": [acc_out],
-                "Correct": [correct],
-                "Total": [total],
-            })
-
-        block.append_op(
-            type="cast",
-            inputs={"X": [self._states["Total"]]},
-            outputs={"Out": [self._states["Total"]]},
-            attrs={
-                "in_data_type": 5,  # float32
-                "out_data_type": 2,  #int32
-            })
-        block.append_op(
-            type="cast",
-            inputs={"X": [self._states["Correct"]]},
-            outputs={"Out": [self._states["Correct"]]},
-            attrs={
-                "in_data_type": 5,
-                "out_data_type": 2,
-            })
-
-        block.append_op(
-            type="elementwise_add",
-            inputs={"X": [self._states["Total"]],
-                    "Y": [total]},
-            outputs={"Out": [self._states["Total"]]})
-        block.append_op(
-            type="elementwise_add",
-            inputs={"X": [self._states["Correct"]],
-                    "Y": [correct]},
-            outputs={"Out": [self._states["Correct"]]})
-
-        return acc_out
+        main_program = self.helper.main_program
+        if main_program.current_block().idx != 0:
+            raise ValueError("You can only invoke Evaluator in root block")
+
+        self.total = self.create_state(dtype='int64', shape=[1], suffix='total')
+        self.correct = self.create_state(
+            dtype='int64', shape=[1], suffix='correct')
+        kwargs = {'main_program': main_program}
+        total = self.helper.create_tmp_variable(dtype='int')
+        correct = self.helper.create_tmp_variable(dtype='int')
+        acc = layers.accuracy(
+            input=input,
+            label=label,
+            k=k,
+            total=total,
+            correct=correct,
+            **kwargs)
+        total = layers.cast(x=total, dtype='int64', **kwargs)
+        correct = layers.cast(x=correct, dtype='int64', **kwargs)
+        layers.sums(input=[self.total, total], out=self.total, **kwargs)
+        layers.sums(input=[self.correct, correct], out=self.correct, **kwargs)
+
+        self.metrics.append(acc)
 
     def eval(self, executor, eval_program=None):
-        if eval_program != None:
-            eval_program = eval_program
-        else:
+        if eval_program is None:
             eval_program = Program()
-        block = eval_program.global_block()
-        eval_out = block.create_var(dtype=self._states["Total"].data_type)
-        e_total = _clone_var_in_block_(block, self._states["Total"])
-        e_correct = _clone_var_in_block_(block, self._states["Correct"])
-        block.append_op(
-            type="cast",
-            inputs={"X": [e_total]},
-            outputs={"Out": [e_total]},
-            attrs={
-                "in_data_type": 2,  #int32
-                "out_data_type": 5,  #float32
-            })
-        block.append_op(
-            type="cast",
-            inputs={"X": [e_correct]},
-            outputs={"Out": [e_correct]},
-            attrs={
-                "in_data_type": 2,
-                "out_data_type": 5,
-            })
-        block.append_op(
-            type="elementwise_div",
-            inputs={"X": e_correct,
-                    "Y": e_total},
-            outputs={"Out": eval_out})
-        out = executor.run(eval_program, fetch_list=[eval_out])
-        return np.array(out[0])
-
-
-def accuracy(*args, **kwargs):
-    cls = Accuracy(*args, **kwargs)
-    out = cls._update_ops(*args, **kwargs)
-    return cls, out
+        block = eval_program.current_block()
+        kwargs = {'main_program': eval_program}
+        total = _clone_var_(block, self.total)
+        correct = _clone_var_(block, self.correct)
+        total = layers.cast(total, dtype='float32', **kwargs)
+        correct = layers.cast(correct, dtype='float32', **kwargs)
+        out = layers.elementwise_div(x=correct, y=total, **kwargs)
+        return np.array(executor.run(eval_program, fetch_list=[out])[0])
diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py
index ed1c2c06daa7ede97e138049a1f7044d071c31e8..bdc82eede9d93a7cf904999a6b869ce2d23c90dc 100644
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -1,9 +1,40 @@
-import paddle.v2.fluid.core as core
-from paddle.v2.fluid.framework import Block, Program, g_main_program
+import numpy as np
+from . import core
+from framework import Program, default_main_program
+
+__all__ = ['Executor', 'g_scope']
 
 g_scope = core.Scope()
 
 
+def as_numpy(tensor):
+    if isinstance(tensor, list):
+        return [as_numpy(t) for t in tensor]
+    assert isinstance(tensor, core.LoDTensor)
+    lod = tensor.lod()
+    tensor_data = np.array(tensor)
+    if len(lod) == 0:
+        ans = tensor_data
+    else:
+        raise RuntimeError("LoD Calculate lacks unit tests and buggy")
+    # elif len(lod) == 1:
+    #     ans = []
+    #     idx = 0
+    #     while idx < len(lod) - 1:
+    #         ans.append(tensor_data[lod[idx]:lod[idx + 1]])
+    #         idx += 1
+    # else:
+    #     for l in reversed(lod):
+    #         ans = []
+    #         idx = 0
+    #         while idx < len(l) - 1:
+    #             ans.append(tensor_data[l[idx]:l[idx + 1]])
+    #             idx += 1
+    #         tensor_data = ans
+    #     ans = tensor_data
+    return ans
+
+
 class Executor(object):
     def __init__(self, places):
         if not isinstance(places, list) and not isinstance(places, tuple):
@@ -16,6 +47,47 @@ class Executor(object):
             act_places.append(p)
 
         self.executor = core.Executor(act_places)
+        self.places = places
+
+    def aslodtensor(self, data):
+        def accumulate(data):
+            if not isinstance(data, list):
+                return 1
+            return sum([accumulate(sub) for sub in data])
+
+        def parselod(data):
+            seq_lens = [accumulate(seq) for seq in data]
+            cur_len = 0
+            lod = [cur_len]
+            for l in seq_lens:
+                cur_len += l
+                lod.append(cur_len)
+            return lod
+
+        assert len(self.places) != 0
+        if not isinstance(data, list):
+            # pure tensor case
+            tensor = core.LoDTensor()
+            tensor.set(data, self.places[0])
+            return tensor
+        else:
+            raise RuntimeError("Current implementation lacks unittests")
+            # lodtensor case
+            lod = []
+            if not isinstance(data[0], list):
+                lod.append(parselod(data))
+                flattened_data = np.concatenate(data, axis=0).astype("int64")
+            else:
+                while isinstance(data[0], list):
+                    lod.append(parselod(seq))
+                    flattened_data = [item for seq in data for item in seq]
+                    data = flattened_data
+                flattened_data = np.concatenate(data, axis=0).astype("int64")
+            flattened_data = flattened_data.reshape([len(flattened_data), 1])
+            tensor = core.LoDTensor()
+            tensor.set(flattened_data, self.places[0])
+            tensor.set_lod(lod)
+            return tensor
 
     def run(self,
             program=None,
@@ -23,14 +95,15 @@ class Executor(object):
             fetch_list=None,
             feed_var_name='feed',
             fetch_var_name='fetch',
-            scope=None):
+            scope=None,
+            return_numpy=True):
         if feed is None:
             feed = {}
         if fetch_list is None:
             fetch_list = []
 
         if program is None:
-            program = g_main_program
+            program = default_main_program()
 
         if not isinstance(program, Program):
             raise TypeError()
@@ -52,7 +125,10 @@ class Executor(object):
                 inputs={'X': [feed_var]},
                 outputs={'Out': [out]},
                 attrs={'col': i})
-            core.set_feed_variable(scope, feed[name], feed_var.name, i)
+            cur_feed = feed[name]
+            if not isinstance(cur_feed, core.LoDTensor):
+                cur_feed = self.aslodtensor(cur_feed)
+            core.set_feed_variable(scope, cur_feed, feed_var.name, i)
 
         fetch_var = global_block.create_var(
             name=fetch_var_name,
@@ -66,7 +142,11 @@ class Executor(object):
                 attrs={'col': i})
 
         self.executor.run(program.desc, scope, 0, True)
-        return [
+        outs = [
             core.get_fetch_variable(scope, fetch_var_name, i)
             for i in xrange(len(fetch_list))
         ]
+
+        if return_numpy:
+            outs = as_numpy(outs)
+        return outs
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index a6eca2d7194c30aabeafc34de0957792feeebbec..1c42e4d44f5046e0db171fdaeb8e7af38a2cae07 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -1,10 +1,13 @@
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 import collections
+
 import numpy as np
-import copy
+from . import core
+import proto.framework_pb2 as framework_pb2
 
-__all__ = ['Block', 'Variable', 'Program', 'Operator', 'default_startup_program', 'default_main_program']
+__all__ = [
+    'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
+    'default_main_program'
+]
 
 
 def unique_name(prefix):
@@ -12,6 +15,37 @@ def unique_name(prefix):
     return "_".join([prefix, str(uid)])
 
 
+def convert_np_dtype_to_dtype_(np_dtype):
+    dtype = np.dtype(np_dtype)
+    if dtype == np.float32:
+        return core.DataType.FP32
+    elif dtype == np.float64:
+        return core.DataType.FP64
+    elif dtype == np.float16:
+        return core.DataType.FP16
+    elif dtype == np.int32:
+        return core.DataType.INT32
+    elif dtype == np.int16:
+        return core.DataType.INT16
+    elif dtype == np.int64:
+        return core.DataType.INT64
+    elif dtype == np.bool:
+        return core.DataType.BOOL
+    else:
+        raise ValueError("Not supported numpy dtype " + str(dtype))
+
+
+def dtype_is_floating(dtype):
+    if not isinstance(dtype, core.DataType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
+
+    if (dtype == core.DataType.FP16 or dtype == core.DataType.FP32 or
+            dtype == core.DataType.FP64):
+        return True
+    else:
+        return False
+
+
 def _debug_string_(proto, throw_on_error=True):
     error_fields = list()
     if not proto.IsInitialized(error_fields) and throw_on_error:
@@ -63,11 +97,11 @@ class Variable(object):
                         "matched.".format(self.name, old_shape, shape))
         if dtype is not None:
             if not isinstance(dtype, core.DataType):
-                dtype = Variable._convert_np_dtype_to_dtype_(dtype)
+                dtype = convert_np_dtype_to_dtype_(dtype)
             if is_new_var:
-                self.desc.set_data_type(dtype)
+                self.desc.set_dtype(dtype)
             else:
-                old_dtype = self.data_type
+                old_dtype = self.dtype
                 if dtype != old_dtype:
                     raise ValueError("Variable {0} has been created before. "
                                      "The previous data type is {1}; the new "
@@ -128,8 +162,8 @@ class Variable(object):
         return tuple(self.desc.shape())
 
     @property
-    def data_type(self):
-        return self.desc.data_type()
+    def dtype(self):
+        return self.desc.dtype()
 
     @property
     def lod_level(self):
@@ -145,26 +179,6 @@ class Variable(object):
         uid = core.unique_integer(prefix)  # unique during whole process.
         return "_".join([prefix, str(uid)])
 
-    @staticmethod
-    def _convert_np_dtype_to_dtype_(np_dtype):
-        dtype = np.dtype(np_dtype)
-        if dtype == np.float32:
-            return core.DataType.FP32
-        elif dtype == np.float64:
-            return core.DataType.FP64
-        elif dtype == np.float16:
-            return core.DataType.FP16
-        elif dtype == np.int32:
-            return core.DataType.INT32
-        elif dtype == np.int16:
-            return core.DataType.INT16
-        elif dtype == np.int64:
-            return core.DataType.INT64
-        elif dtype == np.bool:
-            return core.DataType.BOOL
-        else:
-            raise ValueError("Not supported numpy dtype " + str(dtype))
-
 
 def get_all_op_protos():
     """
@@ -232,17 +246,17 @@ class Operator(object):
                     in_proto.name)
 
                 if found:
-                    in_argus = inputs[in_proto.name]
-                    if not isinstance(in_argus, list):
-                        in_argus = [in_argus]
-                    if not in_proto.duplicable and len(in_argus) > 1:
+                    in_args = inputs[in_proto.name]
+                    if not isinstance(in_args, list):
+                        in_args = [in_args]
+                    if not in_proto.duplicable and len(in_args) > 1:
                         raise ValueError(
                             "Input %s expects only one input, but %d are given."
-                            % (in_proto.name, len(in_argus)))
-                    in_argu_names = []
-                    for argu in in_argus:
-                        in_argu_names.append(argu.name)
-                    self.desc.set_input(in_proto.name, in_argu_names)
+                            % (in_proto.name, len(in_args)))
+                    in_arg_names = []
+                    for arg in in_args:
+                        in_arg_names.append(arg.name)
+                    self.desc.set_input(in_proto.name, in_arg_names)
                 else:
                     self.desc.set_input(in_proto.name, [])
 
@@ -260,18 +274,18 @@ class Operator(object):
                         str(e) for e in given)))
 
             for out_proto in proto.outputs:
-                out_argus = outputs[out_proto.name]
-                if not isinstance(out_argus, list):
-                    out_argus = [out_argus]
-                if not out_proto.duplicable and len(out_argus) > 1:
+                out_args = outputs[out_proto.name]
+                if not isinstance(out_args, list):
+                    out_args = [out_args]
+                if not out_proto.duplicable and len(out_args) > 1:
                     raise ValueError(
                         "Output %s expects only one output, but %d are given." %
-                        (out_proto.name, len(out_argus)))
-                out_argu_names = []
-                for argu in out_argus:
-                    out_argu_names.append(argu.name)
-                    argu.op = self
-                self.desc.set_output(out_proto.name, out_argu_names)
+                        (out_proto.name, len(out_args)))
+                out_arg_names = []
+                for arg in out_args:
+                    out_arg_names.append(arg.name)
+                    arg.op = self
+                self.desc.set_output(out_proto.name, out_arg_names)
 
         if attrs is not None:
             if not isinstance(attrs, dict):
@@ -381,7 +395,11 @@ class Block(object):
         return v
 
     def all_parameters(self):
-        return {v for k, v in self.vars.iteritems() if isinstance(v, Parameter)}
+        return list(self.iter_parameters())
+
+    def iter_parameters(self):
+        return (item[1] for item in self.vars.iteritems()
+                if isinstance(item[1], Parameter))
 
     def create_var(self, *args, **kwargs):
         var = Variable(self, *args, **kwargs)
@@ -455,6 +473,37 @@ class Block(object):
         for index in range(len(self.ops)):
             assert self.ops[index].desc == ops_in_cpp[index]
 
+    def copy_param_info_from(self, other):
+        """
+        Copy the information of parameters from other block
+        Args:
+            other(Block): other block 
+
+        Returns:
+            None
+        """
+        if not isinstance(other, Block):
+            raise TypeError("copy_param_info_from should be invoked with Block")
+        for p in other.iter_parameters():
+            assert isinstance(p, Parameter)
+            v = self.vars.get(p.name, None)
+            if v is None:
+                raise ValueError("copy_param_info_from should be invoked with "
+                                 "same topology")
+            assert isinstance(v, Variable)
+            new_p = Parameter(
+                block=self,
+                shape=v.shape,
+                dtype=v.dtype,
+                type=v.type,
+                lod_level=v.lod_level,
+                stop_gradient=p.stop_gradient,
+                trainable=p.trainable,
+                optimize_attr=p.optimize_attr,
+                regularizer=p.regularizer,
+                name=v.name)
+            self.vars[new_p.name] = new_p
+
 
 class Program(object):
     def __init__(self):
@@ -475,6 +524,7 @@ class Program(object):
         p.desc = core.ProgramDesc(self.desc)
         p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
         p.sync_with_cpp()
+        p.copy_param_info_from(self)
         return p
 
     def prune(self, targets):
@@ -497,6 +547,13 @@ class Program(object):
         res.sync_with_cpp()
         return res
 
+    def inference_optimize(self):
+        res = Program()
+        res.desc = core.inference_optimize(self.desc)
+        res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
+        res.sync_with_cpp()
+        return res
+
     @staticmethod
     def parse_from_string(binary_str):
         p = Program()
@@ -551,6 +608,24 @@ class Program(object):
         for block in self.blocks:
             block.sync_with_cpp()
 
+    def copy_param_info_from(self, other):
+        """
+        Copy the information of parameters from other program. 
+        Args:
+            other(Program): Other program
+
+        Returns:
+            None
+        """
+        if not isinstance(other, Program):
+            raise TypeError("copy_param_info_from should be invoked with "
+                            "Program")
+
+        if len(self.blocks) != len(other.blocks):
+            raise ValueError("copy_param_info_from should be invoked with two "
+                             "program, with represent the same topology")
+        self.global_block().copy_param_info_from(other.global_block())
+
     def list_vars(self):
         for each_block in self.blocks:
             for each_var in each_block.vars.itervalues():
@@ -579,11 +654,13 @@ class Parameter(Variable):
 
 
 # program is a global instance.
-g_main_program = Program()
-g_startup_program = Program()
+_main_program_ = Program()
+_startup_program_ = Program()
+
 
 def default_startup_program():
-    return g_startup_program
+    return _startup_program_
+
 
 def default_main_program():
-    return g_main_program
+    return _main_program_
diff --git a/python/paddle/v2/fluid/initializer.py b/python/paddle/v2/fluid/initializer.py
index ded144ecd5db83ce50ca0dc6243fdc52ac0b7a2f..d3f648f8460814a3f251d7aa9560d748af85235c 100644
--- a/python/paddle/v2/fluid/initializer.py
+++ b/python/paddle/v2/fluid/initializer.py
@@ -1,10 +1,7 @@
-import paddle.v2.fluid.framework as framework
+import framework
 import numpy as np
 
-__all__ = [
-    'ConstantInitializer', 'UniformInitializer', 'NormalInitializer',
-    'XavierInitializer'
-]
+__all__ = ['Constant', 'Uniform', 'Normal', 'Xavier']
 
 
 class Initializer(object):
@@ -93,7 +90,7 @@ class ConstantInitializer(Initializer):
             outputs={"Out": var},
             attrs={
                 "shape": var.shape,
-                "data_type": int(var.data_type),
+                "dtype": int(var.dtype),
                 "value": self._value
             })
         var.op = op
@@ -140,7 +137,7 @@ class UniformInitializer(Initializer):
             outputs={"Out": var},
             attrs={
                 "shape": var.shape,
-                "data_type": int(var.data_type),
+                "dtype": int(var.dtype),
                 "min": self._low,
                 "max": self._high,
                 "seed": self._seed
@@ -188,7 +185,7 @@ class NormalInitializer(Initializer):
             outputs={"Out": var},
             attrs={
                 "shape": var.shape,
-                "data_type": int(var.data_type),
+                "dtype": int(var.dtype),
                 "mean": self._mean,
                 "std": self._std_dev,
                 "seed": self._seed
@@ -265,7 +262,7 @@ class XavierInitializer(Initializer):
                 outputs={"Out": var},
                 attrs={
                     "shape": var.shape,
-                    "data_type": int(var.data_type),
+                    "dtype": int(var.dtype),
                     "min": -limit,
                     "max": limit,
                     "seed": self._seed
@@ -278,10 +275,109 @@ class XavierInitializer(Initializer):
                 outputs={"Out": var},
                 attrs={
                     "shape": var.shape,
-                    "data_type": int(var.data_type),
+                    "dtype": int(var.dtype),
                     "mean": 0.0,
                     "std": std,
                     "seed": self._seed
                 })
         var.op = op
         return op
+
+
+class MSRAInitializer(Initializer):
+    """Implements the MSRA initializer a.k.a. Kaiming Initializer
+
+    This class implements the weight initialization from the paper
+    Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+    ImageNet Classification[1] by Kaiming He, Xiangyu Zhang, Shaoqing Ren
+    and Jian Sun. This is a robust initialization method that particularly
+    considers the rectifier nonlinearities. In case of Uniform distribution,
+    the range is [-x, x], where x = sqrt(6 / fan_in). In case of Normal
+    distribution, the mean is 0 and the standard deviation
+    is sqrt(2/ fan_in).
+
+    References:
+        [1] Delving Deep into Rectifiers: Surpassing Human-Level Performance
+            on ImageNet Classification
+            (https://arxiv.org/abs/1502.01852)
+    """
+
+    def __init__(self, uniform=True, fan_in=None, seed=0):
+        """Constructor for MSRAInitializer
+
+        Args:
+            uniform: whether to use uniform or normal distribution
+            fan_in: fan_in for MSRAInitializer. If None, it is
+                    inferred from the variable.
+            seed: random seed
+
+        Note: It is recommended to set fan_in to None for most cases.
+        """
+        assert uniform is not None
+        assert seed is not None
+        super(MSRAInitializer, self).__init__()
+        self._uniform = uniform
+        self._fan_in = fan_in
+        self._seed = seed
+
+    def __call__(self, var, block):
+        """Add MSRA initialization ops for a variable
+
+        Args:
+            var: Variable that needs to be initialized
+            block: The block in which initialization ops
+                   should be added
+
+        Returns:
+            the initialization op
+        """
+        assert isinstance(var, framework.Variable)
+        assert isinstance(block, framework.Block)
+        f_in, f_out = self._compute_fans(var)
+
+        # If fan_in is passed, use it
+        fan_in = f_in if self._fan_in is None else self._fan_in
+
+        if self._uniform:
+            limit = np.sqrt(6.0 / float(fan_in))
+            op = block.prepend_op(
+                type="uniform_random",
+                outputs={"Out": var},
+                attrs={
+                    "shape": var.shape,
+                    "dtype": int(var.dtype),
+                    "min": -limit,
+                    "max": limit,
+                    "seed": self._seed
+                })
+
+        else:
+            std = np.sqrt(2.0 / float(fan_in))
+            op = block.prepend_op(
+                type="gaussian_random",
+                outputs={"Out": var},
+                attrs={
+                    "shape": var.shape,
+                    "dtype": int(var.dtype),
+                    "mean": 0.0,
+                    "std": std,
+                    "seed": self._seed
+                })
+        var.op = op
+        return op
+
+
+# We short the class name, since users will use the initializer with the package
+# name. The sample code:
+#
+# import paddle.fluid as fluid
+#
+# hidden = fluid.layers.fc(...,
+#                          param_attr=ParamAttr(fluid.initializer.Xavier()))
+#
+# It is no need to add an `Initializer` as the class suffix
+Constant = ConstantInitializer
+Uniform = UniformInitializer
+Normal = NormalInitializer
+Xavier = XavierInitializer
+MSRA = MSRAInitializer
diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py
index 2d070814eef0b099ba71bef223596e30388ac48a..e147ac22ad289eb00c83def66974d875fcdc31f8 100644
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -1,12 +1,12 @@
 import os
 import cPickle as pickle
 
-from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \
-    Variable
+from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
 
 __all__ = [
     'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
-    'load_persistables', "save_inference_model", "load_inference_model"
+    'load_persistables', "save_inference_model", "load_inference_model",
+    "get_inference_program"
 ]
 
 
@@ -23,7 +23,7 @@ def _clone_var_in_block_(block, var):
     return block.create_var(
         name=var.name,
         shape=var.shape,
-        dtype=var.data_type,
+        dtype=var.dtype,
         type=var.type,
         lod_level=var.lod_level,
         persistable=True)
@@ -45,7 +45,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
     """
     if vars is None:
         if main_program is None:
-            main_program = g_main_program
+            main_program = default_main_program()
         if not isinstance(main_program, Program):
             raise TypeError("program should be as Program type or None")
 
@@ -97,7 +97,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
     :param executor: executor that save variable
     :param dirname: directory path
     :param main_program: program. If vars is None, then filter all variables in this
-    program which fit `predicate`. Default g_program.
+    program which fit `predicate`. Default default_main_program().
     :param predicate: The Predicate describes a callable that returns a variable
     as a bool. If it returns true, the variables will be loaded.
     :param vars: variables need to be loaded. If specify vars, program &
@@ -106,7 +106,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
     """
     if vars is None:
         if main_program is None:
-            main_program = g_main_program
+            main_program = default_main_program()
         if not isinstance(main_program, Program):
             raise TypeError("program's type should be Program")
 
@@ -151,6 +151,17 @@ def load_persistables(executor, dirname, main_program=None):
         predicate=is_persistable)
 
 
+def get_inference_program(target_vars, main_program=None):
+    if main_program is None:
+        main_program = default_main_program()
+    if not isinstance(target_vars, list):
+        target_vars = [target_vars]
+
+    pruned_program = main_program.prune(targets=target_vars)
+    inference_program = pruned_program.inference_optimize()
+    return inference_program
+
+
 def save_inference_model(dirname,
                          feeded_var_names,
                          target_vars,
@@ -165,25 +176,26 @@ def save_inference_model(dirname,
     :param target_vars: Variables from which we can get inference results.
     :param executor: executor that save inference model
     :param main_program: original program, which will be pruned to build the inference model.
-    Default g_main_program.
+            Default default_main_program().
 
     :return: None
     """
     if main_program is None:
-        main_program = g_main_program
+        main_program = default_main_program()
     if not isinstance(target_vars, list):
         target_vars = [target_vars]
 
     if not os.path.isdir(dirname):
         os.makedirs(dirname)
 
-    pruned_program = main_program.prune(target_vars)
+    pruned_program = main_program.prune(targets=target_vars)
+    inference_program = pruned_program.inference_optimize()
     fetch_var_names = [v.name for v in target_vars]
 
     model_file_name = dirname + "/__model__"
     with open(model_file_name, "w") as f:
         pickle.dump({
-            "program_desc_str": pruned_program.desc.serialize_to_string(),
+            "program_desc_str": inference_program.desc.serialize_to_string(),
             "feed_var_names": feeded_var_names,
             "fetch_var_names": fetch_var_names
         }, f, -1)
@@ -259,10 +271,10 @@ def get_parameter_value_by_name(name, executor, program=None):
     :param executor: executor for retrieving the value
     :param name: the name of the parameter
     :param program: the program where the variable is found
-    Default g_main_program.
+            Default default_main_program().
     :return: the LoDTensor for the variable
     """
     if program is None:
-        program = g_main_program
+        program = default_main_program()
     var = program.global_block().var(name)
     return get_parameter_value(var, executor)
diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py
index a97e07982bd89be72386970f28a0dd049f82372d..5b384e5cf5df5e5abc7f0ef81ff11cd8a31cfa2d 100644
--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -1,10 +1,10 @@
 import copy
 import itertools
 
-from paddle.v2.fluid.framework import Variable, g_main_program, \
-    g_startup_program, unique_name, Program
-from paddle.v2.fluid.initializer import ConstantInitializer, \
-    UniformInitializer, XavierInitializer
+from framework import Variable, default_main_program, default_startup_program, \
+    unique_name, dtype_is_floating
+from paddle.v2.fluid.initializer import Constant, Xavier
+from param_attr import ParamAttr
 
 
 class LayerHelper(object):
@@ -23,7 +23,7 @@ class LayerHelper(object):
     def main_program(self):
         prog = self.kwargs.get('main_program', None)
         if prog is None:
-            return g_main_program
+            return default_main_program()
         else:
             return prog
 
@@ -31,7 +31,7 @@ class LayerHelper(object):
     def startup_program(self):
         prog = self.kwargs.get('startup_program', None)
         if prog is None:
-            return g_startup_program
+            return default_startup_program()
         else:
             return prog
 
@@ -61,31 +61,15 @@ class LayerHelper(object):
 
     @property
     def param_attr(self):
-        default = {'name': None, 'initializer': XavierInitializer()}
-        actual = self.kwargs.get('param_attr', None)
-        if actual is None:
-            actual = default
-        for default_field in default.keys():
-            if default_field not in actual:
-                actual[default_field] = default[default_field]
-        return actual
+        return ParamAttr.to_attr(self.kwargs.get('param_attr', None))
 
     @property
     def bias_attr(self):
-        default = {'name': None, 'initializer': ConstantInitializer()}
-        bias_attr = self.kwargs.get('bias_attr', None)
-        if bias_attr is None:
-            bias_attr = default
-
-        if isinstance(bias_attr, dict):
-            for default_field in default.keys():
-                if default_field not in bias_attr:
-                    bias_attr[default_field] = default[default_field]
-        return bias_attr
+        return ParamAttr.to_attr(self.kwargs.get('bias_attr', None))
 
     def multiple_param_attr(self, length):
         param_attr = self.param_attr
-        if isinstance(param_attr, dict):
+        if isinstance(param_attr, ParamAttr):
             param_attr = [param_attr]
 
         if len(param_attr) != 1 and len(param_attr) != length:
@@ -108,23 +92,35 @@ class LayerHelper(object):
         dtype = None
         for each in inputs:
             if dtype is None:
-                dtype = each.data_type
-            elif dtype != each.data_type:
+                dtype = each.dtype
+            elif dtype != each.dtype:
                 raise ValueError("Data Type mismatch")
         return dtype
 
-    def create_parameter(self, attr, shape, dtype, suffix='w',
-                         initializer=None):
+    def create_parameter(self,
+                         attr,
+                         shape,
+                         dtype,
+                         is_bias=False,
+                         default_initializer=None):
         # Deepcopy the attr so that parameters can be shared in program
-        attr_copy = copy.deepcopy(attr)
-        if initializer is not None:
-            attr_copy['initializer'] = initializer
-        if attr_copy['name'] is None:
-            attr_copy['name'] = unique_name(".".join([self.name, suffix]))
+        assert isinstance(attr, ParamAttr)
+        suffix = 'b' if is_bias else 'w'
+
+        if default_initializer is None:
+            if is_bias:
+                attr.set_default_bias_initializer()
+            else:
+                attr.set_default_param_initializer()
+        else:
+            attr.set_default_initializer(default_initializer)
+        if attr.name is None:
+            attr.name = unique_name(".".join([self.name, suffix]))
+
         self.startup_program.global_block().create_parameter(
-            dtype=dtype, shape=shape, **attr_copy)
+            dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True))
         return self.main_program.global_block().create_parameter(
-            name=attr_copy['name'], dtype=dtype, shape=shape)
+            dtype=dtype, shape=shape, **attr.to_kwargs())
 
     def create_tmp_variable(self, dtype):
         return self.main_program.current_block().create_var(
@@ -144,7 +140,7 @@ class LayerHelper(object):
         self.startup_program.global_block().create_var(
             name=var.name,
             type=var.type,
-            dtype=var.data_type,
+            dtype=var.dtype,
             shape=var.shape,
             persistable=True,
             initializer=initializer)
@@ -154,8 +150,10 @@ class LayerHelper(object):
         Append bias operator and return its output. If the user does not set
         bias_attr, append_bias_op will return input_var
 
-        :param input_var: the input variable. The len(input_var.shape) is larger
-        or equal than 2.
+        :param input_var: the input variable. The len(input_var.shape) is
+        larger or equal than 2.
+        :bias_initializer: an instance of a subclass of Initializer used to
+        initialize the bias
         :param dim_start:
         :param dim_end: the shape of the bias will be
         input_var.shape[dim_start:dim_end]. The bias is broadcasted to other
@@ -167,8 +165,8 @@ class LayerHelper(object):
             return input_var
 
         b = self.create_parameter(
-            attr=bias_attr, shape=size, dtype=input_var.data_type, suffix='b')
-        tmp = self.create_tmp_variable(dtype=input_var.data_type)
+            attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
+        tmp = self.create_tmp_variable(dtype=input_var.dtype)
         self.append_op(
             type='elementwise_add',
             inputs={'X': [input_var],
@@ -183,7 +181,7 @@ class LayerHelper(object):
             return input_var
         if isinstance(act, basestring):
             act = {'type': act}
-        tmp = self.create_tmp_variable(dtype=input_var.data_type)
+        tmp = self.create_tmp_variable(dtype=input_var.dtype)
         act_type = act.pop('type')
         self.append_op(
             type=act_type,
@@ -191,3 +189,10 @@ class LayerHelper(object):
             outputs={"Y": [tmp]},
             attrs=act)
         return tmp
+
+    def _get_default_initializer(self, dtype):
+        if dtype is None or dtype_is_floating(dtype) is True:
+            return Xavier()
+        else:
+            # For integer and boolean types, initialize with all zeros
+            return Constant()
diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py
index 1789d2f82a8813331b3610fc69f8447925cd7501..9dcc11d21618ec12ac6a2112ed8e307ab028f6c0 100644
--- a/python/paddle/v2/fluid/layers.py
+++ b/python/paddle/v2/fluid/layers.py
@@ -1,12 +1,11 @@
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
-from paddle.v2.fluid.framework import OpProtoHolder, Variable, Program, \
-    Operator
-from paddle.v2.fluid.initializer import ConstantInitializer, \
-    NormalInitializer
+import core
+import proto.framework_pb2 as framework_pb2
+from framework import OpProtoHolder, Variable, Program, Operator
+from initializer import Constant, Normal, Xavier, Initializer
 from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
 import re
 import cStringIO
+from param_attr import ParamAttr
 
 __all__ = [
     'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat',
@@ -17,11 +16,11 @@ __all__ = [
 
 def fc(input,
        size,
+       num_flatten_dims=1,
        param_attr=None,
        bias_attr=None,
-       name=None,
        act=None,
-       num_flatten_dims=1,
+       name=None,
        main_program=None,
        startup_program=None):
     """
@@ -30,11 +29,15 @@ def fc(input,
     Args:
        input: The input tensor to the function
        size: The size of the layer
+       num_flatten_dims: Number of columns in input
        param_attr: The parameters/weights to the FC Layer
+       param_initializer: Initializer used for the weight/parameter.
+       If None, XavierInitializer() is used
        bias_attr: The bias parameter for the FC layer
-       name: Name/alias of the function
+       bias_initializer: Initializer used for the bias.
+       If None, then ConstantInitializer() is used
        act: Activation to be applied to the output of FC layer
-       num_flatten_dims: Number of columns in input
+       name: Name/alias of the function
        main_program: Name of the main program that calls this
        startup_program: Name of the startup program
 
@@ -61,7 +64,7 @@ def fc(input,
             reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
         ] + [size]
         w = helper.create_parameter(
-            attr=param_attr, shape=param_shape, dtype=dtype)
+            attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
         tmp = helper.create_tmp_variable(dtype)
         helper.append_op(
             type="mul",
@@ -89,20 +92,21 @@ def fc(input,
 
 def embedding(input,
               size,
-              data_type='float32',
               is_sparse=False,
               param_attr=None,
+              dtype='float32',
               main_program=None,
               startup_program=None):
     """
     Embedding Layer.
 
     Args:
+       param_initializer:
        input: The input to the function
        size: The size of the layer
-       data_type: The type of data : float32, float_16, int etc
        is_sparse: A flag that decleares whether the input is sparse
        param_attr: Parameters for this layer
+       dtype: The type of data : float32, float_16, int etc
        main_program: Name of the main program that calls this
        startup_program: Name of the startup program
 
@@ -114,10 +118,11 @@ def embedding(input,
     to the LayerHelper constructor.
 
     """
+
     helper = LayerHelper('embedding', **locals())
     w = helper.create_parameter(
-        attr=helper.param_attr, shape=size, dtype=data_type)
-    tmp = helper.create_tmp_variable(data_type)
+        attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
+    tmp = helper.create_tmp_variable(dtype)
     helper.append_op(
         type='lookup_table',
         inputs={'Ids': input,
@@ -130,7 +135,6 @@ def embedding(input,
 # TODO(qijun): expose H0 and C0
 def dynamic_lstm(input,
                  size,
-                 data_type='float32',
                  param_attr=None,
                  bias_attr=None,
                  use_peepholes=True,
@@ -138,22 +142,23 @@ def dynamic_lstm(input,
                  gate_activation='sigmoid',
                  cell_activation='tanh',
                  candidate_activation='tanh',
+                 dtype='float32',
                  main_program=None,
                  startup_program=None):
     helper = LayerHelper('lstm', **locals())
     size = size / 4
     weight = helper.create_parameter(
-        attr=helper.param_attr, shape=[size, 4 * size], dtype=data_type)
+        attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
     bias_size = [1, 7 * size]
     if not use_peepholes:
         bias_size[1] = 4 * size
     bias = helper.create_parameter(
-        attr=helper.bias_attr, shape=bias_size, dtype=data_type, suffix='b')
+        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
 
-    hidden = helper.create_tmp_variable(data_type)
-    cell = helper.create_tmp_variable(data_type)
-    batch_gate = helper.create_tmp_variable(data_type)
-    batch_cell_pre_act = helper.create_tmp_variable(data_type)
+    hidden = helper.create_tmp_variable(dtype)
+    cell = helper.create_tmp_variable(dtype)
+    batch_gate = helper.create_tmp_variable(dtype)
+    batch_cell_pre_act = helper.create_tmp_variable(dtype)
 
     helper.append_op(
         type='lstm',
@@ -178,9 +183,9 @@ def dynamic_lstm(input,
 
 def data(name,
          shape,
-         data_type='float32',
-         type=core.VarDesc.VarType.LOD_TENSOR,
          append_batch_size=True,
+         dtype='float32',
+         type=core.VarDesc.VarType.LOD_TENSOR,
          main_program=None,
          startup_program=None,
          stop_gradient=True):
@@ -190,9 +195,9 @@ def data(name,
     Args:
        name: The name/alias of the function
        shape: Tuple declaring the shape.
-       data_type: The type of data : float32, float_16, int etc
-       type: The output type. By default it is LOD_TENSOR.
        append_batch_size: Whether or not to append the data as a batch.
+       dtype: The type of data : float32, float_16, int etc
+       type: The output type. By default it is LOD_TENSOR.
        main_program: Name of the main program that calls this
        startup_program: Name of the startup program
        stop_gradient: A boolean that mentions whether gradient should flow.
@@ -221,12 +226,12 @@ def data(name,
     return helper.create_global_variable(
         name=name,
         shape=shape,
-        dtype=data_type,
+        dtype=dtype,
         type=type,
         stop_gradient=stop_gradient)
 
 
-def create_tensor(dtype, name=None, main_program=None):
+def create_tensor(dtype, name=None, main_program=None, startup_program=None):
     helper = LayerHelper("create_tensor", **locals())
     return helper.create_variable(name=helper.name, dtype=dtype)
 
@@ -332,9 +337,9 @@ def _create_op_func_(op_type):
     o_name = not_intermediate_outputs[0].name
     intermediate_output_names = [output.name for output in intermediate_outputs]
 
-    def infer_and_check_data_type(op_proto, **kwargs):
+    def infer_and_check_dtype(op_proto, **kwargs):
         """
-        This function performs the sanity check for data_type and
+        This function performs the sanity check for dtype and
         instance type.
         """
         dtype = None
@@ -349,8 +354,8 @@ def _create_op_func_(op_type):
                         op_type))
 
                 if dtype is None:
-                    dtype = each.data_type
-                elif dtype != each.data_type:
+                    dtype = each.dtype
+                elif dtype != each.dtype:
                     raise ValueError(
                         "operator {0} must input same dtype".format(op_type))
 
@@ -359,7 +364,7 @@ def _create_op_func_(op_type):
     def func(**kwargs):
         helper = LayerHelper(op_type, **kwargs)
 
-        dtype = infer_and_check_data_type(op_proto, **kwargs)
+        dtype = infer_and_check_dtype(op_proto, **kwargs)
 
         inputs = dict()
         for ipt in op_proto.inputs:
@@ -388,45 +393,28 @@ def _create_op_func_(op_type):
 _create_op_func_('mean')
 _create_op_func_('mul')
 _create_op_func_('elementwise_add')
+_create_op_func_('elementwise_div')
 _create_op_func_('dropout')
 _create_op_func_('reshape')
-_create_op_func_('elementwise_add')
 _create_op_func_('sigmoid')
 _create_op_func_('scale')
 _create_op_func_('reshape')
 _create_op_func_('transpose')
 
 
-def fill_constant(data_type, shape, value=None, program=None):
-    """
-    This function creates a tensor , with shape as mentioned in the input and
-    specified data_type and fills this up with a constant value that
-    comes in the input.
-    """
-    helper = LayerHelper('fill_constant', **locals())
-    out = helper.create_tmp_variable(dtype=data_type)
-    helper.append_op(
-        type='fill_constant',
-        outputs={'Out': [out]},
-        attrs={'data_type': data_type,
-               'shape': shape,
-               'value': value})
-    return out
-
-
-def cast(x, data_type, main_program=None):
+def cast(x, dtype, main_program=None):
     """
-    This function takes in the input with input_data_type
-    and casts it to the output_data_type as the output.
+    This function takes in the input with input_dtype
+    and casts it to the output_dtype as the output.
     """
     helper = LayerHelper('cast', **locals())
-    out = helper.create_tmp_variable(dtype=data_type)
+    out = helper.create_tmp_variable(dtype=dtype)
     helper.append_op(
         type='cast',
         inputs={'X': [x]},
         outputs={'Out': [out]},
-        attrs={'in_data_type': x.data_type,
-               'out_data_type': out.data_type})
+        attrs={'in_dtype': x.dtype,
+               'out_dtype': out.dtype})
     return out
 
 
@@ -445,18 +433,49 @@ def concat(input, axis, main_program=None, startup_program=None):
     return out
 
 
-def sums(input, main_program=None, startup_program=None):
+def sums(input, out=None, main_program=None, startup_program=None):
     """
     This function takes in the input and performs the sum operation on it
     and returns that as the output.
     """
     helper = LayerHelper('sum', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if out is None:
+        out = helper.create_tmp_variable(dtype=helper.input_dtype())
     helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out})
     return out
 
 
-def assign(input, output, main_program=None):
+def linear_chain_crf(input,
+                     label,
+                     param_attr=None,
+                     main_program=None,
+                     startup_program=None):
+    helper = LayerHelper('linear_chain_crf', **locals())
+    size = input.shape[1]
+    transition = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=[size + 2, size],
+        dtype=helper.input_dtype())
+    alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
+    emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
+    transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
+    log_likelihood = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='linear_chain_crf',
+        inputs={"Emission": [input],
+                "Transition": transition,
+                "Label": label},
+        outputs={
+            "Alpha": [alpha],
+            "EmissionExps": [emission_exps],
+            "TransitionExps": transition_exps,
+            "LogLikelihood": log_likelihood
+        })
+
+    return log_likelihood
+
+
+def assign(input, output, main_program=None, startup_program=None):
     helper = LayerHelper('assign', **locals())
     helper.append_op(
         type='scale',
@@ -468,12 +487,12 @@ def assign(input, output, main_program=None):
 
 def split_lod_tensor(input,
                      mask,
-                     level,
+                     level=0,
                      main_program=None,
                      startup_program=None):
     helper = LayerHelper('split_lod_tensor', **locals())
-    out_true = helper.create_tmp_variable(dtype=input.data_type)
-    out_false = helper.create_tmp_variable(dtype=input.data_type)
+    out_true = helper.create_tmp_variable(dtype=input.dtype)
+    out_false = helper.create_tmp_variable(dtype=input.dtype)
     helper.append_op(
         type='split_lod_tensor',
         inputs={
@@ -490,11 +509,11 @@ def merge_lod_tensor(in_true,
                      in_false,
                      x,
                      mask,
-                     level,
+                     level=0,
                      main_program=None,
                      startup_program=None):
     helper = LayerHelper('merge_lod_tensor', **locals())
-    out = helper.create_tmp_variable(dtype=x.data_type)
+    out = helper.create_tmp_variable(dtype=in_true.dtype)
     helper.append_op(
         type='merge_lod_tensor',
         inputs={'X': x,
@@ -512,9 +531,9 @@ def cos_sim(X, Y, **kwargs):
     X and Y and returns that as the output.
     """
     helper = LayerHelper('cos_sim', **kwargs)
-    out = helper.create_tmp_variable(dtype=X.data_type)
-    xnorm = helper.create_tmp_variable(dtype=X.data_type)
-    ynorm = helper.create_tmp_variable(dtype=X.data_type)
+    out = helper.create_tmp_variable(dtype=X.dtype)
+    xnorm = helper.create_tmp_variable(dtype=X.dtype)
+    ynorm = helper.create_tmp_variable(dtype=X.dtype)
     helper.append_op(
         type='cos_sim',
         inputs={'X': [X],
@@ -530,7 +549,7 @@ def cross_entropy(input, label, **kwargs):
     This function computes cross_entropy using the input and label.
     """
     helper = LayerHelper('cross_entropy', **kwargs)
-    out = helper.create_tmp_variable(dtype=input.data_type)
+    out = helper.create_tmp_variable(dtype=input.dtype)
     helper.append_op(
         type='cross_entropy',
         inputs={'X': [input],
@@ -546,26 +565,26 @@ def square_error_cost(input, label, **kwargs):
     The output is appending the op to do the above.
     """
     helper = LayerHelper('square_error_cost', **kwargs)
-    minus_out = helper.create_tmp_variable(dtype=input.data_type)
+    minus_out = helper.create_tmp_variable(dtype=input.dtype)
     helper.append_op(
         type='elementwise_sub',
         inputs={'X': [input],
                 'Y': [label]},
         outputs={'Out': [minus_out]})
 
-    square_out = helper.create_tmp_variable(dtype=input.data_type)
+    square_out = helper.create_tmp_variable(dtype=input.dtype)
     helper.append_op(
         type='square', inputs={'X': [minus_out]}, outputs={'Y': [square_out]})
     return square_out
 
 
-def accuracy(input, label, k=1, **kwargs):
+def accuracy(input, label, k=1, correct=None, total=None, **kwargs):
     """
     This function computes the accuracy using the input and label.
     The output is the top_k inputs and their indices.
     """
     helper = LayerHelper("accuracy", **kwargs)
-    topk_out = helper.create_tmp_variable(dtype=input.data_type)
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
     topk_indices = helper.create_tmp_variable(dtype="int64")
     helper.append_op(
         type="top_k",
@@ -573,10 +592,11 @@ def accuracy(input, label, k=1, **kwargs):
         outputs={"Out": [topk_out],
                  "Indices": [topk_indices]},
         attrs={"k": k})
-    acc_out_dtype = kwargs.get("out_dtype", "float32")
     acc_out = helper.create_tmp_variable(dtype="float32")
-    correct = helper.create_tmp_variable(dtype="int64")
-    total = helper.create_tmp_variable(dtype="int64")
+    if correct is None:
+        correct = helper.create_tmp_variable(dtype="int64")
+    if total is None:
+        total = helper.create_tmp_variable(dtype="int64")
     helper.append_op(
         type="accuracy",
         inputs={
@@ -596,10 +616,10 @@ def sequence_conv(input,
                   num_filters,
                   filter_size=3,
                   filter_stride=1,
-                  act=None,
                   padding=None,
                   bias_attr=None,
                   param_attr=None,
+                  act=None,
                   main_program=None,
                   startup_program=None):
     """
@@ -607,13 +627,13 @@ def sequence_conv(input,
     other convolutional configurations for the filters and stride as given
     in the input parameters to the function.
     """
+
     # FIXME(dzh) : want to unify the argument of python layer
     # function. So we ignore some unecessary attributes.
     # such as, padding_trainable, context_start.
 
     helper = LayerHelper('sequence_conv', **locals())
     dtype = helper.input_dtype()
-
     filter_shape = [filter_size * input.shape[1], num_filters]
     filter = helper.create_parameter(
         attr=helper.param_attr, shape=filter_shape, dtype=dtype)
@@ -637,14 +657,14 @@ def sequence_conv(input,
 
 def conv2d(input,
            num_filters,
-           name=None,
-           filter_size=[1, 1],
-           act=None,
-           groups=None,
+           filter_size,
            stride=[1, 1],
            padding=None,
-           bias_attr=None,
+           groups=None,
            param_attr=None,
+           bias_attr=None,
+           act=None,
+           name=None,
            main_program=None,
            startup_program=None):
     """
@@ -654,6 +674,7 @@ def conv2d(input,
     This funciton can also append an activation on top of the
     conv-2d output, if mentioned in the input parameters.
     """
+
     helper = LayerHelper('conv2d', **locals())
     dtype = helper.input_dtype()
 
@@ -661,7 +682,7 @@ def conv2d(input,
     if groups is None:
         num_filter_channels = num_channels
     else:
-        if num_channels % groups is not 0:
+        if num_channels % groups != 0:
             raise ValueError("num_channels must be divisible by groups.")
         num_filter_channels = num_channels / groups
 
@@ -675,12 +696,16 @@ def conv2d(input,
     input_shape = input.shape
     filter_shape = [num_filters, num_filter_channels] + filter_size
 
-    std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
+    def _get_default_param_initializer():
+        std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
+        return Normal(0.0, std, 0)
+
     filter = helper.create_parameter(
         attr=helper.param_attr,
         shape=filter_shape,
         dtype=dtype,
-        initializer=NormalInitializer(0.0, std, 0))
+        default_initializer=_get_default_param_initializer())
+
     pre_bias = helper.create_tmp_variable(dtype)
 
     helper.append_op(
@@ -795,22 +820,18 @@ def batch_norm(input,
         attr=helper.param_attr,
         shape=param_shape,
         dtype=dtype,
-        initializer=ConstantInitializer(1.0))
+        default_initializer=Constant(1.0))
+
     bias = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=param_shape,
-        dtype=dtype,
-        initializer=ConstantInitializer(0.0))
+        attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True)
 
     mean = helper.create_global_variable(
-        dtype=input.data_type, shape=param_shape, persistable=True)
-    helper.set_variable_initializer(
-        var=mean, initializer=ConstantInitializer(0.0))
+        dtype=input.dtype, shape=param_shape, persistable=True)
+    helper.set_variable_initializer(var=mean, initializer=Constant(0.0))
 
     variance = helper.create_global_variable(
-        dtype=input.data_type, shape=param_shape, persistable=True)
-    helper.set_variable_initializer(
-        var=variance, initializer=ConstantInitializer(1.0))
+        dtype=input.dtype, shape=param_shape, persistable=True)
+    helper.set_variable_initializer(var=variance, initializer=Constant(1.0))
 
     # create output
     # mean and mean_out share the same memory
@@ -847,8 +868,8 @@ def batch_norm(input,
 
 def beam_search_decode(ids, scores, main_program=None, startup_program=None):
     helper = LayerHelper('beam_search_decode', **locals())
-    sentence_ids = helper.create_tmp_variable(dtype=ids.data_type)
-    sentence_scores = helper.create_tmp_variable(dtype=ids.data_type)
+    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
+    sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
 
     helper.append_op(
         type="beam_search_decode",
@@ -986,7 +1007,7 @@ class StaticRNN(object):
             boot_var = parent_block.create_var(
                 name=var_name,
                 shape=shape,
-                dtype=batch_ref.data_type,
+                dtype=batch_ref.dtype,
                 persistable=False)
 
             parent_block.append_op(
@@ -996,7 +1017,7 @@ class StaticRNN(object):
                 attrs={
                     'value': init_value,
                     'shape': boot_var.shape,
-                    'data_type': boot_var.data_type,
+                    'dtype': boot_var.dtype,
                     'input_dim_idx': ref_batch_dim_idx,
                     'output_dim_idx': init_batch_dim_idx
                 })
@@ -1005,7 +1026,7 @@ class StaticRNN(object):
         else:
             pre_mem = self.helper.create_variable(
                 name=unique_name("@".join([self.helper.name, "mem"])),
-                dtype=init.data_type,
+                dtype=init.dtype,
                 shape=init.shape)
             self.memories[pre_mem.name] = StaticRNNMemoryLink(
                 init=init, pre_mem=pre_mem)
@@ -1021,10 +1042,7 @@ class StaticRNN(object):
             raise ValueError("Static RNN only take fix seq_len input")
 
         ipt = self.helper.create_variable(
-            name=x.name,
-            dtype=x.data_type,
-            shape=list(x.shape[1:]),
-            type=x.type)
+            name=x.name, dtype=x.dtype, shape=list(x.shape[1:]), type=x.type)
         self.inputs.append(ipt)
         return ipt
 
@@ -1033,17 +1051,17 @@ class StaticRNN(object):
         if not isinstance(o, Variable):
             raise TypeError("step output takes a Variable")
 
-        tmp_o = self.helper.create_tmp_variable(dtype=o.data_type)
+        tmp_o = self.helper.create_tmp_variable(dtype=o.dtype)
         self.helper.append_op(
             type='rnn_memory_helper',
             inputs={'X': [o]},
             outputs={'Out': tmp_o},
-            attrs={'data_type': o.data_type})
+            attrs={'dtype': o.dtype})
 
         out_var = self.parent_block().create_var(
             name=tmp_o.name,
             shape=[self.seq_len] + list(tmp_o.shape),
-            dtype=tmp_o.data_type)
+            dtype=tmp_o.dtype)
 
         self.outputs.append(out_var)
 
@@ -1115,13 +1133,13 @@ class StaticRNN(object):
             pre_memories.append(mem.pre_mem.name)
             mem_var = rnn_block.var(mem.mem.name)
             assert isinstance(mem_var, Variable)
-            new_mem = self.helper.create_tmp_variable(dtype=mem_var.data_type)
+            new_mem = self.helper.create_tmp_variable(dtype=mem_var.dtype)
 
             rnn_block.append_op(
                 type='rnn_memory_helper',
                 inputs={'X': [mem_var]},
                 outputs={'Out': [new_mem]},
-                attrs={'data_type': mem_var.data_type})
+                attrs={'dtype': mem_var.dtype})
 
             memories.append(new_mem.name)
 
@@ -1171,7 +1189,7 @@ class While(object):
         if not isinstance(cond, Variable):
             raise TypeError("condition should be a variable")
         assert isinstance(cond, Variable)
-        if cond.data_type != core.DataType.BOOL:
+        if cond.dtype != core.DataType.BOOL:
             raise TypeError("condition should be a bool variable")
         if reduce(lambda a, b: a * b, cond.shape, 1) != 1:
             raise TypeError("condition should be a bool scalar")
@@ -1243,9 +1261,9 @@ def lstm(x,
                       main_program=main_program,
                       startup_program=startup_program)
 
-        data_type = x.data_type
-        c = helper.create_tmp_variable(data_type)
-        h = helper.create_tmp_variable(data_type)
+        dtype = x.dtype
+        c = helper.create_tmp_variable(dtype)
+        h = helper.create_tmp_variable(dtype)
 
         helper.append_op(
             type='lstm_unit',
@@ -1278,6 +1296,33 @@ def lod_rank_table(x, level=0, main_program=None):
     return table
 
 
+def max_sequence_len(rank_table, main_program=None):
+    """
+    This function creates an operator to calculate the length of
+    max seqence through input rank_table(should be a lod_rank_table)
+    """
+    helper = LayerHelper("max_seqence_len", **locals())
+    res = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="max_sequence_len",
+        inputs={"RankTable": rank_table},
+        outputs={"Out": res})
+    return res
+
+
+def topk(input, k, main_program=None, startup_program=None):
+    helper = LayerHelper('topk', **locals())
+    topk_out = helper.create_tmp_variable(dtype=input.data_type)
+    topk_indices = helper.create_tmp_variable(dtype='int64')
+    helper.append_op(
+        type='top_k',
+        inputs={'X': [input]},
+        outputs={'Out': [topk_out],
+                 'Indices': [topk_indices]},
+        attrs={'k': k})
+    return topk_out, topk_indices
+
+
 def lod_tensor_to_array(x, table, main_program=None):
     """
     This function creates an operator to convert an LOD_Tensor to
@@ -1287,7 +1332,7 @@ def lod_tensor_to_array(x, table, main_program=None):
     array = helper.create_variable(
         name=unique_name("lod_tensor_to_array"),
         type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-        dtype=x.data_type)
+        dtype=x.dtype)
     helper.append_op(
         type='lod_tensor_to_array',
         inputs={'X': x,
@@ -1302,7 +1347,7 @@ def array_to_lod_tensor(x, table, main_program=None):
     LOD_Tensor.
     """
     helper = LayerHelper("array_to_lod_tensor", **locals())
-    tmp = helper.create_tmp_variable(dtype=x.data_type)
+    tmp = helper.create_tmp_variable(dtype=x.dtype)
     helper.append_op(
         type="array_to_lod_tensor",
         inputs={'X': x,
@@ -1311,22 +1356,51 @@ def array_to_lod_tensor(x, table, main_program=None):
     return tmp
 
 
-def fill_constant(shape, dtype, value, main_program=None):
+def fill_constant(shape,
+                  dtype,
+                  value,
+                  out=None,
+                  main_program=None,
+                  startup_program=None):
     """
     This function creates a tensor , with shape as mentioned in the input and
-    specified data_type and fills this up with a constant value that
+    specified dtype and fills this up with a constant value that
     comes in the input. It also sets the stop_gradient to be True.
     """
     helper = LayerHelper("fill_constant", **locals())
-    out = helper.create_tmp_variable(dtype=dtype)
+    if out is None:
+        out = helper.create_tmp_variable(dtype=dtype)
     helper.append_op(
         type='fill_constant',
         inputs={},
         outputs={'Out': [out]},
+        attrs={'shape': shape,
+               'dtype': out.dtype,
+               'value': float(value)})
+    out.stop_gradient = True
+    return out
+
+
+def fill_constant_batch_size_like(input,
+                                  shape,
+                                  dtype,
+                                  value,
+                                  input_dim_idx=0,
+                                  output_dim_idx=0,
+                                  main_program=None,
+                                  startup_program=None):
+    helper = LayerHelper("fill_constant_batch_size_like", **locals())
+    out = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='fill_constant_batch_size_like',
+        inputs={'Input': input},
+        outputs={'Out': [out]},
         attrs={
             'shape': shape,
-            'data_type': out.data_type,
-            'value': float(value)
+            'dtype': out.dtype,
+            'value': float(value),
+            'input_dim_idx': input_dim_idx,
+            'output_dim_idx': output_dim_idx
         })
     out.stop_gradient = True
     return out
@@ -1356,7 +1430,7 @@ def increment(x, value=1.0, in_place=True, main_program=None):
     """
     helper = LayerHelper("increment", **locals())
     if not in_place:
-        out = helper.create_tmp_variable(dtype=x.data_type)
+        out = helper.create_tmp_variable(dtype=x.dtype)
     else:
         out = x
     helper.append_op(
@@ -1377,7 +1451,7 @@ def array_write(x, i, array=None, main_program=None):
         array = helper.create_variable(
             name="{0}.out".format(helper.name),
             type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-            dtype=x.data_type)
+            dtype=x.dtype)
     helper.append_op(
         type='write_to_array',
         inputs={'X': [x],
@@ -1394,7 +1468,7 @@ def create_array(dtype, main_program=None):
         dtype=dtype)
 
 
-def less_than(x, y, cond=None, main_program=None):
+def less_than(x, y, cond=None, main_program=None, **ignored):
     helper = LayerHelper("less_than", **locals())
     if cond is None:
         cond = helper.create_tmp_variable(dtype='bool')
@@ -1416,7 +1490,7 @@ def array_read(array, i, main_program=None):
             array,
             Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
         raise TypeError("array should be tensor array vairable")
-    out = helper.create_tmp_variable(dtype=array.data_type)
+    out = helper.create_tmp_variable(dtype=array.dtype)
     helper.append_op(
         type='read_from_array',
         inputs={'X': [array],
@@ -1431,7 +1505,7 @@ def shrink_memory(x, i, table, main_program=None):
     as mentioned in the input parameter.
     """
     helper = LayerHelper('shrink_memory', **locals())
-    out = helper.create_tmp_variable(dtype=x.data_type)
+    out = helper.create_tmp_variable(dtype=x.dtype)
     helper.append_op(
         type='shrink_rnn_memory',
         inputs={'X': [x],
@@ -1455,6 +1529,93 @@ def array_length(array, main_program=None):
     return tmp
 
 
+def conv2d_transpose(input,
+                     num_filters,
+                     output_size=None,
+                     filter_size=None,
+                     padding=None,
+                     stride=None,
+                     param_attr=None,
+                     main_program=None,
+                     startup_program=None):
+    """
+    The transpose of conv2d layer.
+
+    This layer is also known as deconvolution layer.
+
+    Args:
+        input(Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filter. It is as same as the output
+            image channel.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain two integers, (image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.  None if use output size to
+            calculate filter_size
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride.
+        param_attr: Parameter Attribute.
+        main_program(Program): the main program
+        startup_program(Program): the startup program
+
+    Returns:
+        Variable: Output image.
+    """
+    helper = LayerHelper("conv2d_transpose", **locals())
+    if not isinstance(input, Variable):
+        raise TypeError("Input of conv2d_transpose must be Variable")
+    input_channel = input.shape[1]
+
+    op_attr = dict()
+
+    if isinstance(padding, int):
+        op_attr['paddings'] = [padding, padding]
+    elif padding is not None:
+        op_attr['paddings'] = padding
+
+    if isinstance(stride, int):
+        op_attr['strides'] = stride
+    elif stride is not None:
+        op_attr['strides'] = stride
+
+    if filter_size is None:
+        if output_size is None:
+            raise ValueError("output_size must be set when filter_size is None")
+        if isinstance(output_size, int):
+            output_size = [output_size, output_size]
+
+        padding = op_attr.get('paddings', [0, 0])
+        stride = op_attr.get('strides', [1, 1])
+
+        h_in = input.shape[2]
+        w_in = input.shape[3]
+        filter_size_h = output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0]
+        filter_size_w = output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1]
+        filter_size = [filter_size_h, filter_size_w]
+    elif isinstance(filter_size, int):
+        filter_size = [filter_size, filter_size]
+
+    filter_shape = [input_channel, num_filters] + filter_size
+    img_filter = helper.create_parameter(
+        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
+
+    out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='conv2d_transpose',
+        inputs={'Input': [input],
+                'Filter': [img_filter]},
+        outputs={'Output': out},
+        attrs=op_attr)
+
+    return out
+
+
 class ConditionalBlockGuard(BlockGuard):
     def __init__(self, block):
         if not isinstance(block, ConditionalBlock):
@@ -1472,13 +1633,20 @@ class ConditionalBlockGuard(BlockGuard):
 
 
 class ConditionalBlock(object):
-    def __init__(self, inputs, name=None, main_program=None):
+    def __init__(self,
+                 inputs,
+                 name=None,
+                 main_program=None,
+                 startup_program=None):
         for each_input in inputs:
             if not isinstance(each_input, Variable):
                 raise TypeError("Each input should be variable")
         self.inputs = inputs
         self.helper = LayerHelper(
-            'conditional_block', name=name, main_program=main_program)
+            'conditional_block',
+            name=name,
+            main_program=main_program,
+            startup_program=startup_program)
 
     def block(self):
         return ConditionalBlockGuard(self)
@@ -1523,3 +1691,148 @@ class ConditionalBlock(object):
             outputs={'Out': out_list,
                      'Scope': [step_scope]},
             attrs={'block': inside_block})
+
+
+class IfElseBlockGuard(object):
+    def __init__(self, is_true, ifelse):
+        if not isinstance(ifelse, IfElse):
+            raise TypeError("ifelse must be an instance of IfElse class")
+
+        if ifelse.status != IfElse.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("You cannot invoke IfElse.block() inside a block")
+
+        self.is_true = is_true
+        self.ie = ifelse
+        if is_true:
+            self.cond_block = ifelse.conditional_true_block
+        else:
+            self.cond_block = ifelse.conditional_false_block
+
+        if not isinstance(self.cond_block, ConditionalBlock):
+            raise TypeError("Unexpected situation")
+
+        self.cond_block = self.cond_block.block()
+
+    def __enter__(self):
+        self.ie.status = IfElse.IN_IF_ELSE_TRUE_BLOCKS if self.is_true else IfElse.IN_IF_ELSE_FALSE_BLOCKS
+        self.cond_block.__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if not self.cond_block.__exit__(exc_type, exc_val, exc_tb):
+            # re-raise inside exception
+            return False
+        if len(self.ie.output_table[1 if self.is_true else 0]) == 0:
+            raise ValueError("Must set output inside block")
+        self.ie.status = IfElse.OUT_IF_ELSE_BLOCKS
+
+
+class IfElse(object):
+    OUT_IF_ELSE_BLOCKS = 0
+    IN_IF_ELSE_TRUE_BLOCKS = 1
+    IN_IF_ELSE_FALSE_BLOCKS = 2
+
+    def __init__(self, cond, name=None, main_program=None,
+                 startup_program=None):
+        if not isinstance(cond, Variable):
+            raise TypeError("cond must be a Variable")
+        self.helper = LayerHelper(
+            'ifelse',
+            name=name,
+            main_program=main_program,
+            startup_program=startup_program)
+        self.cond = cond
+        self.input_table = {}
+        self.status = IfElse.OUT_IF_ELSE_BLOCKS
+        self.conditional_true_block = ConditionalBlock(inputs=[self.cond])
+        self.conditional_false_block = ConditionalBlock(inputs=[self.cond])
+        self.output_table = ([], [])  # (true_outs, false_outs)
+
+    def input(self, x):
+        if self.status == IfElse.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("input must in true/false blocks")
+        if id(x) not in self.input_table:
+            parent_block = self.parent_block()
+            out_true = parent_block.create_var(
+                name=unique_name('ifelse_input' + self.helper.name),
+                dtype=x.dtype)
+
+            out_false = parent_block.create_var(
+                name=unique_name('ifelse_input' + self.helper.name),
+                dtype=x.dtype)
+            parent_block.append_op(
+                type='split_lod_tensor',
+                inputs={
+                    'X': x,
+                    'Mask': self.cond,
+                },
+                outputs={'OutTrue': out_true,
+                         'OutFalse': out_false},
+                attrs={'level': 0})
+            self.input_table[id(x)] = (out_true, out_false)
+        else:
+            out_true, out_false = self.input_table[id(x)]
+
+        if self.status == IfElse.IN_IF_ELSE_TRUE_BLOCKS:
+            return out_true
+        else:
+            return out_false
+
+    def parent_block(self):
+        current_block = self.helper.main_program.current_block()
+        return self.helper.main_program.block(current_block.parent_idx)
+
+    def true_block(self):
+        return IfElseBlockGuard(True, self)
+
+    def false_block(self):
+        return IfElseBlockGuard(False, self)
+
+    def output(self, *outs):
+        if self.status == self.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("output can only be invoked in the sub-block")
+
+        out_table = self.output_table[1 if self.status ==
+                                      self.IN_IF_ELSE_TRUE_BLOCKS else 0]
+        parent_block = self.parent_block()
+        for each_out in outs:
+            if not isinstance(each_out, Variable):
+                raise TypeError("Each output should be a variable")
+            # create outside tensor
+            outside_out = parent_block.create_var(
+                name=unique_name("_".join([self.helper.name, 'output'])),
+                dtype=each_out.dtype)
+            out_table.append(outside_out)
+
+            # assign local var to outside
+            assign(
+                input=each_out,
+                output=outside_out,
+                main_program=self.helper.main_program,
+                startup_program=self.helper.startup_program)
+
+    def __call__(self):
+        if self.status != self.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("IfElse::__call__ must be out of sub-block")
+        false_len, true_len = map(len, self.output_table)
+        if false_len == 0 and true_len == 0:
+            raise ValueError("Must invoke true_block/false_block before "
+                             "__call__")
+        elif false_len != true_len and false_len != 0 and true_len != 0:
+            raise ValueError("The output side must be same")
+        elif false_len == 0 or true_len == 0:
+            return self.output_table[0 if false_len != 0 else 1]
+
+        # else none of false_len/true_len is zero
+        # merge together
+        rlist = []
+        for false_var, true_var in zip(*self.output_table):
+            rlist.append(
+                merge_lod_tensor(
+                    in_true=true_var,
+                    in_false=false_var,
+                    mask=self.cond,
+                    x=self.cond,
+                    level=0,
+                    main_program=self.helper.main_program,
+                    startup_program=self.helper.startup_program))
+        return rlist
diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py
index 5e14ca594bc7965dc29039ba57bb7b26b1ce6871..05728ad75a5bd1e87aa3c75ffcc4eac34b6b956c 100644
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -1,4 +1,4 @@
-import paddle.v2.fluid.layers as layers
+import layers
 
 __all__ = ["simple_img_conv_pool", "sequence_conv_pool"]
 
diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py
index d2841df6af7a0d860c239db952c767c995d30ba4..934e024742fd00bf05cc0d7caaaa870c18a68074 100644
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -1,16 +1,13 @@
 from collections import defaultdict
 
-import paddle.v2.fluid.framework as framework
-from paddle.v2.fluid.framework import unique_name, Program
-from paddle.v2.fluid.backward import append_backward_ops
-from paddle.v2.fluid.initializer import ConstantInitializer
-from paddle.v2.fluid.regularizer import append_regularization_ops
-from paddle.v2.fluid.layer_helper import LayerHelper
+import framework
+from backward import append_backward_ops
+from framework import unique_name
+from initializer import Constant
+from layer_helper import LayerHelper
+from regularizer import append_regularization_ops
 
-__all__ = [
-    'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
-    'AdamaxOptimizer', 'DecayedAdagradOptimizer'
-]
+__all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']
 
 
 class Optimizer(object):
@@ -48,7 +45,7 @@ class Optimizer(object):
             persistable=True)
         param_lr = param_lr * self._learning_rate
         self.helper.set_variable_initializer(
-            var=param_lr_var, initializer=ConstantInitializer(param_lr))
+            var=param_lr_var, initializer=Constant(param_lr))
         return param_lr_var
 
     def _create_accumulators(self, block, parameters):
@@ -92,11 +89,11 @@ class Optimizer(object):
         var = self.helper.create_global_variable(
             name=unique_name(name),
             persistable=True,
-            dtype=dtype or param.data_type,
+            dtype=dtype or param.dtype,
             type=param.type,
             shape=param.shape)
         self.helper.set_variable_initializer(
-            var, initializer=ConstantInitializer(value=float(fill_value)))
+            var, initializer=Constant(value=float(fill_value)))
         self._accumulators[name][param.name] = var
 
     def _get_accumulator(self, name, param):
@@ -170,7 +167,8 @@ class Optimizer(object):
 
         optimize_ops = []
         for param_and_grad in parameters_and_grads:
-            if param_and_grad[1] is not None:
+            if param_and_grad[0].trainable is True and param_and_grad[
+                    1] is not None:
                 optimize_op = self._append_optimize_op(loss.block,
                                                        param_and_grad)
                 optimize_ops.append(optimize_op)
@@ -201,7 +199,7 @@ class Optimizer(object):
         """
         params_grads = append_backward_ops(loss, parameter_list, no_grad_set or
                                            set())
-        # Add regularization if any 
+        # Add regularization if any
         params_grads = append_regularization_ops(params_grads)
         optimize_ops = self.create_optimization_pass(params_grads, loss,
                                                      startup_program)
@@ -359,7 +357,7 @@ class AdamOptimizer(Optimizer):
             lod_level=0,
             persistable=True)
         self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=ConstantInitializer(self._beta1))
+            self._beta1_pow_acc, initializer=Constant(self._beta1))
 
         self._beta2_pow_acc = self.helper.create_global_variable(
             name=unique_name('beta2_pow_acc'),
@@ -369,7 +367,7 @@ class AdamOptimizer(Optimizer):
             persistable=True)
 
         self.helper.set_variable_initializer(
-            self._beta2_pow_acc, initializer=ConstantInitializer(self._beta2))
+            self._beta2_pow_acc, initializer=Constant(self._beta2))
 
         # Create accumulator tensors for first and second moments
         for p in parameters:
@@ -461,7 +459,7 @@ class AdamaxOptimizer(Optimizer):
             lod_level=0,
             persistable=True)
         self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=ConstantInitializer(self._beta1))
+            self._beta1_pow_acc, initializer=Constant(self._beta1))
 
         # Create accumulator tensors for first moment and infinity norm
         for p in parameters:
@@ -558,3 +556,19 @@ class DecayedAdagradOptimizer(Optimizer):
             attrs={"epsilon": self._epsilon})
 
         return decayed_adagrad_op
+
+
+# We short the class name, since users will use the optimizer with the package
+# name. The sample code:
+#
+# import paddle.fluid as fluid
+#
+# sgd = fluid.optimizer.SGD(...)
+#
+# It is no need to add an `Optimizer` as the class suffix
+SGD = SGDOptimizer
+Momentum = MomentumOptimizer
+Adagrad = AdagradOptimizer
+Adam = AdamOptimizer
+Adamax = AdamaxOptimizer
+DecayedAdagrad = DecayedAdagradOptimizer
diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py
new file mode 100644
index 0000000000000000000000000000000000000000..86088fdd7ce17b8b7a9688dc838e69b2aa754013
--- /dev/null
+++ b/python/paddle/v2/fluid/param_attr.py
@@ -0,0 +1,61 @@
+from initializer import Initializer, Xavier, Constant
+from regularizer import WeightDecayRegularizer
+
+
+class ParamAttr(object):
+    def __init__(self,
+                 name=None,
+                 initializer=None,
+                 learning_rate=1.0,
+                 regularizer=None,
+                 trainable=True):
+        self.name = name
+        self.initializer = initializer
+        self.learning_rate = learning_rate
+        self.regularizer = regularizer
+        self.trainable = trainable
+
+    def set_default_initializer(self, initializer):
+        if initializer is None:
+            if self.initializer is None:
+                raise ValueError("ParamAttr.initializer is not set")
+            return
+
+        if self.initializer is not None:
+            return
+
+        self.initializer = initializer
+
+    def set_default_param_initializer(self):
+        self.set_default_initializer(Xavier())
+
+    def set_default_bias_initializer(self):
+        self.set_default_initializer(Constant(0.0))
+
+    @staticmethod
+    def to_attr(arg):
+        if arg is None:
+            return ParamAttr()
+        elif isinstance(arg, ParamAttr):
+            return arg
+        elif isinstance(arg, str) or isinstance(arg, unicode):
+            return ParamAttr(name=arg)
+        elif isinstance(arg, Initializer):
+            return ParamAttr(initializer=arg)
+        elif isinstance(arg, WeightDecayRegularizer):
+            return ParamAttr(regularizer=arg)
+        elif isinstance(arg, bool):
+            return ParamAttr.to_attr(None) if arg else False
+        else:
+            raise TypeError("{0} cast to ParamAttr".format(type(arg)))
+
+    def to_kwargs(self, with_initializer=False):
+        kwargs = {
+            'name': self.name,
+            'learning_rate': self.learning_rate,
+            'regularizer': self.regularizer,
+            'trainable': self.trainable
+        }
+        if with_initializer:
+            kwargs['initializer'] = self.initializer
+        return kwargs
diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..2069b713faf41c5c00ceaf47e030864b98c678da
--- /dev/null
+++ b/python/paddle/v2/fluid/profiler.py
@@ -0,0 +1,46 @@
+import paddle.v2.fluid.core as core
+from contextlib import contextmanager
+
+__all__ = ['CudaProfiler']
+
+NVPROF_CONFIG = [
+    "gpustarttimestamp",
+    "gpuendtimestamp",
+    "gridsize3d",
+    "threadblocksize",
+    "streamid",
+    "enableonstart 0",
+    "conckerneltrace",
+]
+
+
+@contextmanager
+def cuda_profiler(output_file, output_mode=None, config=None):
+    """The CUDA profiler.
+    This fuctions is used to profile CUDA program by CUDA runtime application
+    programming interface. The profiling result will be written into
+    `output_file` with Key-Value pair format or Comma separated values format.
+    The user can set the output mode by `output_mode` argument and set the
+    counters/options for profiling by `config` argument. The default config
+    is ['gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d',
+    'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'].
+
+    Args:
+        output_file (string) : The output file name, the result will be
+            written into this file.
+        output_mode (string) : The output mode has Key-Value pair format and
+            Comma separated values format. It should be 'kvp' or 'csv'.
+        config (string) : The profiler options and counters can refer to
+            "Compute Command Line Profiler User Guide".
+    """
+    if output_mode is None:
+        output_mode = 'csv'
+    if output_mode not in ['kvp', 'csv']:
+        raise ValueError("The output mode must be 'kvp' or 'csv'.")
+    config = NVPROF_CONFIG if config is None else config
+    core.nvprof_init(output_file, output_mode, config)
+    # Enables profiler collection by the active CUDA profiling tool.
+    core.nvprof_start()
+    yield
+    # Disables profiler collection.
+    core.nvprof_stop()
diff --git a/python/paddle/v2/fluid/regularizer.py b/python/paddle/v2/fluid/regularizer.py
index 098cd0dd6439554f49e429ab75fb11bfa2c9d28c..c2c18e1951234f7160ff9f92d6dd6922a56683dd 100644
--- a/python/paddle/v2/fluid/regularizer.py
+++ b/python/paddle/v2/fluid/regularizer.py
@@ -1,8 +1,6 @@
-import paddle.v2.fluid.framework as framework
+import framework
 
-__all__ = [
-    'append_regularization_ops', 'L2DecayRegularizer', 'L1DecayRegularizer'
-]
+__all__ = ['append_regularization_ops', 'L1Decay', 'L2Decay']
 
 
 def append_regularization_ops(parameters_and_grads):
@@ -139,3 +137,16 @@ class L1DecayRegularizer(WeightDecayRegularizer):
             attrs={"scale": self._regularization_coeff})
 
         return decay
+
+
+# We short the class name, since users will use the regulaizer with the package
+# name. The sample code:
+#
+# import paddle.fluid as fluid
+#
+# hidden = fluid.layers.fc(...,
+#                          param_attr=ParamAttr(fluid.regularizer.Xavier()))
+#
+# It is no need to add a `Regularizer` as the class suffix
+L1Decay = L1DecayRegularizer
+L2Decay = L2DecayRegularizer
diff --git a/python/paddle/v2/fluid/tests/.gitignore b/python/paddle/v2/fluid/tests/.gitignore
index fcc52c04886865d96c1bfe1597a9dc99c181de1f..a648f2b387c2c7b9422eea6749e43e7b8871f60f 100644
--- a/python/paddle/v2/fluid/tests/.gitignore
+++ b/python/paddle/v2/fluid/tests/.gitignore
@@ -1,2 +1,3 @@
 image/
 fit_a_line.model/
+tmp
diff --git a/python/paddle/v2/fluid/tests/book/CMakeLists.txt b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
index 4d7664469e481344cf9eea84688f068b4fb99dee..a35abe3e0c436be4eaed01c9b9183344c6d3b275 100644
--- a/python/paddle/v2/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
@@ -1,5 +1,11 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+list(REMOVE_ITEM TEST_OPS test_image_classification_train)
+py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet)
+py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg)
+
+# default test
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
index ee677a2c5670a092c509b9ce1c555223bf22957f..9f98493adb21a03b8efde0f88c490e77c9d303e7 100644
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
@@ -1,34 +1,18 @@
-import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
-import paddle.v2.fluid.framework as framework
-from paddle.v2.fluid.io import save_persistables, load_persistables
-from paddle.v2.fluid.executor import Executor
-
 import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
 
-x = layers.data(
-    name='x',
-    shape=[13],
-    data_type='float32')
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
 
-y_predict = layers.fc(input=x,
-                      size=1,
-                      act=None)
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
 
-y = layers.data(
-    name='y',
-    shape=[1],
-    data_type='float32')
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
 
-cost = layers.square_error_cost(
-    input=y_predict,
-    label=y)
-avg_cost = layers.mean(x=cost)
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
 
-sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost)
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
 
 BATCH_SIZE = 20
 
@@ -37,32 +21,24 @@ train_reader = paddle.batch(
         paddle.dataset.uci_housing.train(), buf_size=500),
     batch_size=BATCH_SIZE)
 
-place = core.CPUPlace()
-exe = Executor(place)
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
 
-exe.run(framework.default_startup_program())
+exe.run(fluid.default_startup_program())
 
 PASS_NUM = 100
 for pass_id in range(PASS_NUM):
-    save_persistables(exe, "./fit_a_line.model/")
-    load_persistables(exe, "./fit_a_line.model/")
+    fluid.io.save_persistables(exe, "./fit_a_line.model/")
+    fluid.io.load_persistables(exe, "./fit_a_line.model/")
     for data in train_reader():
-        x_data = np.array(map(lambda x: x[0], data)).astype("float32")
-        y_data = np.array(map(lambda x: x[1], data)).astype("float32")
-
-        tensor_x = core.LoDTensor()
-        tensor_x.set(x_data, place)
-        # print tensor_x.get_dims()
+        x_data = np.array(map(lambda _: _[0], data)).astype("float32")
+        y_data = np.array(map(lambda _: _[1], data)).astype("float32")
 
-        tensor_y = core.LoDTensor()
-        tensor_y.set(y_data, place)
-        # print tensor_y.get_dims()
-        outs = exe.run(framework.default_main_program(),
-                       feed={'x': tensor_x,
-                             'y': tensor_y},
-                       fetch_list=[avg_cost])
-        out = np.array(outs[0])
+        avg_loss_value, = exe.run(fluid.default_main_program(),
+                                  feed={'x': x_data,
+                                        'y': y_data},
+                                  fetch_list=[avg_cost])
 
-        if out[0] < 10.0:
+        if avg_loss_value[0] < 10.0:
             exit(0)  # if avg cost less than 10.0, we think our code is good.
 exit(1)
diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
index f4be835b3ad57d5b0076e8a816c2c3def46e0663..0f0cc5b5406ef51ac3504a95ea716056ae8730af 100644
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -1,22 +1,14 @@
+from __future__ import print_function
+
 import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid.layers as layers
-import paddle.v2.fluid.nets as nets
-import paddle.v2.fluid.optimizer as optimizer
-from paddle.v2.fluid.executor import Executor
-import paddle.v2.fluid.framework as framework
-from paddle.v2.fluid.initializer import XavierInitializer
+import paddle.v2.fluid as fluid
+import sys
 
 
 def resnet_cifar10(input, depth=32):
-    def conv_bn_layer(input,
-                      ch_out,
-                      filter_size,
-                      stride,
-                      padding,
-                      act='relu'):
-        tmp = layers.conv2d(
+    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+        tmp = fluid.layers.conv2d(
             input=input,
             filter_size=filter_size,
             num_filters=ch_out,
@@ -24,39 +16,19 @@ def resnet_cifar10(input, depth=32):
             padding=padding,
             act=None,
             bias_attr=False)
-        return layers.batch_norm(
-            input=tmp,
-            act=act)
+        return fluid.layers.batch_norm(input=tmp, act=act)
 
-    def shortcut(input, ch_in, ch_out, stride, program, init_program):
+    def shortcut(input, ch_in, ch_out, stride):
         if ch_in != ch_out:
-            return conv_bn_layer(input, ch_out, 1, stride, 0, None, program,
-                                 init_program)
+            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
         else:
             return input
 
-    def basicblock(input,
-                   ch_in,
-                   ch_out,
-                   stride):
-        tmp = conv_bn_layer(
-            input,
-            ch_out,
-            3,
-            stride,
-            1)
-        tmp = conv_bn_layer(
-            tmp,
-            ch_out,
-            3,
-            1,
-            1,
-            act=None)
+    def basicblock(input, ch_in, ch_out, stride):
+        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
+        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
         short = shortcut(input, ch_in, ch_out, stride)
-        return layers.elementwise_add(
-            x=tmp,
-            y=short,
-            act='relu')
+        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
 
     def layer_warp(block_func, input, ch_in, ch_out, count, stride):
         tmp = block_func(input, ch_in, ch_out, stride)
@@ -67,46 +39,18 @@ def resnet_cifar10(input, depth=32):
     assert (depth - 2) % 6 == 0
     n = (depth - 2) / 6
     conv1 = conv_bn_layer(
-        input=input,
-        ch_out=16,
-        filter_size=3,
-        stride=1,
-        padding=1)
-    res1 = layer_warp(
-        basicblock,
-        conv1,
-        16,
-        16,
-        n,
-        1)
-    res2 = layer_warp(
-        basicblock,
-        res1,
-        16,
-        32,
-        n,
-        2)
-    res3 = layer_warp(
-        basicblock,
-        res2,
-        32,
-        64,
-        n,
-        2)
-    pool = layers.pool2d(
-        input=res3,
-        pool_size=8,
-        pool_type='avg',
-        pool_stride=1)
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
     return pool
 
 
 def vgg16_bn_drop(input):
-    def conv_block(input,
-                   num_filter,
-                   groups,
-                   dropouts):
-        return nets.img_conv_group(
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
             input=input,
             pool_size=2,
             pool_stride=2,
@@ -123,52 +67,42 @@ def vgg16_bn_drop(input):
     conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
     conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
 
-    drop = layers.dropout(
-        x=conv5,
-        dropout_prob=0.5)
-    fc1 = layers.fc(input=drop,
-                    size=512,
-                    act=None,
-                    param_attr={"initializer": XavierInitializer()})
-    reshape1 = layers.reshape(
-        x=fc1,
-        shape=list(fc1.shape + (1, 1)))
-    bn = layers.batch_norm(
-        input=reshape1,
-        act='relu')
-    drop2 = layers.dropout(
-        x=bn,
-        dropout_prob=0.5)
-    fc2 = layers.fc(input=drop2,
-                    size=512,
-                    act=None,
-                    param_attr={"initializer": XavierInitializer()})
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
     return fc2
 
 
 classdim = 10
 data_shape = [3, 32, 32]
 
-images = layers.data(name='pixel', shape=data_shape, data_type='float32')
-label = layers.data(name='label', shape=[1], data_type='int64')
+images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
-# Add neural network config
-# option 1. resnet
-# net = resnet_cifar10(images, 32)
-# option 2. vgg
-net = vgg16_bn_drop(images)
+net_type = "vgg"
+if len(sys.argv) >= 2:
+    net_type = sys.argv[1]
 
-# print(program)
+if net_type == "vgg":
+    print("train vgg net")
+    net = vgg16_bn_drop(images)
+elif net_type == "resnet":
+    print("train resnet")
+    net = resnet_cifar10(images, 32)
+else:
+    raise ValueError("%s network is not supported" % net_type)
 
-predict = layers.fc(input=net, size=classdim, act='softmax')
-cost = layers.cross_entropy(input=predict, label=label)
-avg_cost = layers.mean(x=cost)
-accuracy = layers.accuracy(input=predict, label=label)
+predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
 
-# optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-optimizer = optimizer.AdamOptimizer(learning_rate=0.001)
+optimizer = fluid.optimizer.Adam(learning_rate=0.001)
 opts = optimizer.minimize(avg_cost)
 
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
 BATCH_SIZE = 128
 PASS_NUM = 1
 
@@ -177,13 +111,13 @@ train_reader = paddle.batch(
         paddle.dataset.cifar.train10(), buf_size=128 * 10),
     batch_size=BATCH_SIZE)
 
-place = core.CPUPlace()
-exe = Executor(place)
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
 
-exe.run(framework.default_startup_program())
+exe.run(fluid.default_startup_program())
 
 for pass_id in range(PASS_NUM):
-    batch_id = 0
+    accuracy.reset(exe)
     for data in train_reader():
         img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                 data)).astype("float32")
@@ -193,23 +127,13 @@ for pass_id in range(PASS_NUM):
             batch_size = batch_size * i
         y_data = y_data.reshape([batch_size, 1])
 
-        tensor_img = core.LoDTensor()
-        tensor_y = core.LoDTensor()
-        tensor_img.set(img_data, place)
-        tensor_y.set(y_data, place)
-
-        outs = exe.run(framework.default_main_program(),
-                       feed={"pixel": tensor_img,
-                             "label": tensor_y},
-                       fetch_list=[avg_cost, accuracy])
-
-        loss = np.array(outs[0])
-        acc = np.array(outs[1])
-        print("pass_id:" + str(pass_id) + " batch_id:" + str(batch_id) +
-              " loss:" + str(loss) + " acc:" + str(acc))
-        batch_id = batch_id + 1
-
-        if batch_id > 1:
-            # this model is slow, so if we can train two mini batch, we think it works properly.
-            exit(0)
+        loss, acc = exe.run(fluid.default_main_program(),
+                            feed={"pixel": img_data,
+                                  "label": y_data},
+                            fetch_list=[avg_cost] + accuracy.metrics)
+        pass_acc = accuracy.eval(exe)
+        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
+            pass_acc))
+        # this model is slow, so if we can train two mini batch, we think it works properly.
+        exit(0)
 exit(1)
diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcd6f4d6bc66fd01406332bd1d6d7a5c4b0ddb5a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -0,0 +1,188 @@
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.dataset.conll05 as conll05
+import paddle.v2.fluid as fluid
+
+word_dict, verb_dict, label_dict = conll05.get_dict()
+word_dict_len = len(word_dict)
+label_dict_len = len(label_dict)
+pred_len = len(verb_dict)
+
+mark_dict_len = 2
+word_dim = 32
+mark_dim = 5
+hidden_dim = 512
+depth = 8
+mix_hidden_lr = 1e-3
+
+IS_SPARSE = True
+PASS_NUM = 10
+BATCH_SIZE = 20
+
+embedding_name = 'emb'
+
+
+def load_parameter(file_name, h, w):
+    with open(file_name, 'rb') as f:
+        f.read(16)  # skip header.
+        return np.fromfile(f, dtype=np.float32).reshape(h, w)
+
+
+def db_lstm():
+    # 8 features
+    word = fluid.layers.data(name='word_data', shape=[1], dtype='int64')
+    predicate = fluid.layers.data(name='verb_data', shape=[1], dtype='int64')
+    ctx_n2 = fluid.layers.data(name='ctx_n2_data', shape=[1], dtype='int64')
+    ctx_n1 = fluid.layers.data(name='ctx_n1_data', shape=[1], dtype='int64')
+    ctx_0 = fluid.layers.data(name='ctx_0_data', shape=[1], dtype='int64')
+    ctx_p1 = fluid.layers.data(name='ctx_p1_data', shape=[1], dtype='int64')
+    ctx_p2 = fluid.layers.data(name='ctx_p2_data', shape=[1], dtype='int64')
+    mark = fluid.layers.data(name='mark_data', shape=[1], dtype='int64')
+
+    predicate_embedding = fluid.layers.embedding(
+        input=predicate,
+        size=[pred_len, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr='vemb')
+
+    mark_embedding = fluid.layers.embedding(
+        input=mark,
+        size=[mark_dict_len, mark_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE)
+
+    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+    emb_layers = [
+        fluid.layers.embedding(
+            size=[word_dict_len, word_dim],
+            input=x,
+            param_attr=fluid.ParamAttr(
+                name=embedding_name, trainable=False)) for x in word_input
+    ]
+    emb_layers.append(predicate_embedding)
+    emb_layers.append(mark_embedding)
+
+    hidden_0_layers = [
+        fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
+    ]
+
+    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
+
+    lstm_0 = fluid.layers.dynamic_lstm(
+        input=hidden_0,
+        size=hidden_dim,
+        candidate_activation='relu',
+        gate_activation='sigmoid',
+        cell_activation='sigmoid')
+
+    # stack L-LSTM and R-LSTM with direct edges
+    input_tmp = [hidden_0, lstm_0]
+
+    for i in range(1, depth):
+        mix_hidden = fluid.layers.sums(input=[
+            fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
+            fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
+        ])
+
+        lstm = fluid.layers.dynamic_lstm(
+            input=mix_hidden,
+            size=hidden_dim,
+            candidate_activation='relu',
+            gate_activation='sigmoid',
+            cell_activation='sigmoid',
+            is_reverse=((i % 2) == 1))
+
+        input_tmp = [mix_hidden, lstm]
+
+    feature_out = fluid.layers.sums(input=[
+        fluid.layers.fc(input=input_tmp[0], size=label_dict_len),
+        fluid.layers.fc(input=input_tmp[1], size=label_dict_len)
+    ])
+
+    return feature_out
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    # define network topology
+    feature_out = db_lstm()
+    target = fluid.layers.data(name='target', shape=[1], dtype='int64')
+    crf_cost = fluid.layers.linear_chain_crf(
+        input=feature_out,
+        label=target,
+        param_attr=fluid.ParamAttr(
+            name='crfw', learning_rate=mix_hidden_lr))
+    avg_cost = fluid.layers.mean(x=crf_cost)
+    # TODO(qiao)
+    #   1. add crf_decode_layer and evaluator
+    #   2. use other optimizer and check why out will be NAN
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
+    sgd_optimizer.minimize(avg_cost)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.conll05.test(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    exe.run(fluid.default_startup_program())
+
+    embedding_param = fluid.g_scope.find_var(embedding_name).get_tensor()
+    embedding_param.set(
+        load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place)
+
+    batch_id = 0
+    for pass_id in xrange(PASS_NUM):
+        for data in train_data():
+            word_data = to_lodtensor(map(lambda x: x[0], data), place)
+            ctx_n2_data = to_lodtensor(map(lambda x: x[1], data), place)
+            ctx_n1_data = to_lodtensor(map(lambda x: x[2], data), place)
+            ctx_0_data = to_lodtensor(map(lambda x: x[3], data), place)
+            ctx_p1_data = to_lodtensor(map(lambda x: x[4], data), place)
+            ctx_p2_data = to_lodtensor(map(lambda x: x[5], data), place)
+            verb_data = to_lodtensor(map(lambda x: x[6], data), place)
+            mark_data = to_lodtensor(map(lambda x: x[7], data), place)
+            target = to_lodtensor(map(lambda x: x[8], data), place)
+
+            outs = exe.run(fluid.default_main_program(),
+                           feed={
+                               'word_data': word_data,
+                               'ctx_n2_data': ctx_n2_data,
+                               'ctx_n1_data': ctx_n1_data,
+                               'ctx_0_data': ctx_0_data,
+                               'ctx_p1_data': ctx_p1_data,
+                               'ctx_p2_data': ctx_p2_data,
+                               'verb_data': verb_data,
+                               'mark_data': mark_data,
+                               'target': target
+                           },
+                           fetch_list=[avg_cost])
+            avg_cost_val = np.array(outs[0])
+
+            if batch_id % 10 == 0:
+                print("avg_cost=" + str(avg_cost_val))
+
+            # exit early for CI
+            exit(0)
+
+            batch_id = batch_id + 1
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
index f330ff58137068e429008bc7aa07bbc8d2e35ac4..ba686b56f8603834c12f5ed24e0ef7308c78585d 100644
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
@@ -1,30 +1,18 @@
-import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
-import paddle.v2.fluid.nets as nets
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
-import paddle.v2.fluid.evaluator as evaluator
-import paddle.v2.fluid.framework as framework
-from paddle.v2.fluid.executor import Executor
-
+from __future__ import print_function
 import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
 
-images = layers.data(
-    name='pixel',
-    shape=[1, 28, 28],
-    data_type='float32')
-label = layers.data(
-    name='label',
-    shape=[1],
-    data_type='int64')
-conv_pool_1 = nets.simple_img_conv_pool(
+images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+conv_pool_1 = fluid.nets.simple_img_conv_pool(
     input=images,
     filter_size=5,
     num_filters=20,
     pool_size=2,
     pool_stride=2,
     act="relu")
-conv_pool_2 = nets.simple_img_conv_pool(
+conv_pool_2 = fluid.nets.simple_img_conv_pool(
     input=conv_pool_1,
     filter_size=5,
     num_filters=50,
@@ -32,17 +20,13 @@ conv_pool_2 = nets.simple_img_conv_pool(
     pool_stride=2,
     act="relu")
 
-predict = layers.fc(input=conv_pool_2,
-                    size=10,
-                    act="softmax")
-cost = layers.cross_entropy(input=predict, label=label)
-avg_cost = layers.mean(x=cost)
-optimizer = optimizer.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
-opts = optimizer.minimize(avg_cost)
+predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+optimizer = fluid.optimizer.Adam(learning_rate=0.01)
+optimizer.minimize(avg_cost)
 
-accuracy, acc_out = evaluator.accuracy(
-    input=predict,
-    label=label)
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
 
 BATCH_SIZE = 50
 PASS_NUM = 3
@@ -51,13 +35,12 @@ train_reader = paddle.batch(
         paddle.dataset.mnist.train(), buf_size=500),
     batch_size=BATCH_SIZE)
 
-place = core.CPUPlace()
-exe = Executor(place)
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
 
-exe.run(framework.default_startup_program())
+exe.run(fluid.default_startup_program())
 
 for pass_id in range(PASS_NUM):
-    count = 0
     accuracy.reset(exe)
     for data in train_reader():
         img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
@@ -65,25 +48,19 @@ for pass_id in range(PASS_NUM):
         y_data = np.array(map(lambda x: x[1], data)).astype("int64")
         y_data = y_data.reshape([BATCH_SIZE, 1])
 
-        tensor_img = core.LoDTensor()
-        tensor_y = core.LoDTensor()
-        tensor_img.set(img_data, place)
-        tensor_y.set(y_data, place)
-
-        outs = exe.run(framework.default_main_program(),
-                       feed={"pixel": tensor_img,
-                             "label": tensor_y},
-                       fetch_list=[avg_cost, acc_out])
-        loss = np.array(outs[0])
-        acc = np.array(outs[1])
+        loss, acc = exe.run(fluid.default_main_program(),
+                            feed={"pixel": img_data,
+                                  "label": y_data},
+                            fetch_list=[avg_cost] + accuracy.metrics)
         pass_acc = accuracy.eval(exe)
-        print "pass id : ", pass_id, pass_acc
+        print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" +
+              str(pass_acc))
         # print loss, acc
-        if loss < 10.0 and acc > 0.9:
+        if loss < 10.0 and pass_acc > 0.9:
             # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good.
             exit(0)
 
     pass_acc = accuracy.eval(exe)
-    print "pass id : ", pass_id, pass_acc
+    print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
 
 exit(1)
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
index b0164e3e3659c19edf2af45e706fb48ac1fe2b1c..fa18965aac667c0829b9e6ee56ece585564f9060 100644
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
@@ -1,84 +1,94 @@
-import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
-import paddle.v2.fluid.framework as framework
-from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.regularizer import L2DecayRegularizer
-from paddle.v2.fluid.initializer import UniformInitializer
-
+from __future__ import print_function
 import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
 
 BATCH_SIZE = 128
-image = layers.data(
-    name='x',
-    shape=[784],
-    data_type='float32')
-
-param_attr = {
-    'name': None,
-    'initializer': UniformInitializer(
-        low=-1.0, high=1.0),
-    'regularization': L2DecayRegularizer(0.0005 * BATCH_SIZE)
-}
-
-hidden1 = layers.fc(input=image,
-                    size=128,
-                    act='relu',
-                    param_attr=param_attr)
-hidden2 = layers.fc(input=hidden1,
-                    size=64,
-                    act='relu',
-                    param_attr=param_attr)
-
-predict = layers.fc(input=hidden2,
-                    size=10,
-                    act='softmax',
-                    param_attr=param_attr)
-
-label = layers.data(
-    name='y',
-    shape=[1],
-    data_type='int64')
-
-cost = layers.cross_entropy(input=predict, label=label)
-avg_cost = layers.mean(x=cost)
-accuracy = layers.accuracy(
-    input=predict,
-    label=label)
-
-optimizer = optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+image = fluid.layers.data(name='x', shape=[784], dtype='float32')
+
+regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
+
+hidden1 = fluid.layers.fc(input=image,
+                          size=128,
+                          act='relu',
+                          param_attr=regularizer)
+hidden2 = fluid.layers.fc(input=hidden1,
+                          size=64,
+                          act='relu',
+                          param_attr=regularizer)
+
+predict = fluid.layers.fc(input=hidden2,
+                          size=10,
+                          act='softmax',
+                          param_attr=regularizer)
+
+label = fluid.layers.data(name='y', shape=[1], dtype='int64')
+
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+
+optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
 opts = optimizer.minimize(avg_cost)
 
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+inference_program = fluid.default_main_program().clone()
+test_accuracy = fluid.evaluator.Accuracy(
+    input=predict, label=label, main_program=inference_program)
+test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
+inference_program = fluid.io.get_inference_program(
+    test_target, main_program=inference_program)
+
 train_reader = paddle.batch(
     paddle.reader.shuffle(
         paddle.dataset.mnist.train(), buf_size=8192),
     batch_size=BATCH_SIZE)
 
-place = core.CPUPlace()
-exe = Executor(place)
+test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
 
-exe.run(framework.default_startup_program())
+exe.run(fluid.default_startup_program())
 
 PASS_NUM = 100
 for pass_id in range(PASS_NUM):
+    accuracy.reset(exe)
     for data in train_reader():
         x_data = np.array(map(lambda x: x[0], data)).astype("float32")
         y_data = np.array(map(lambda x: x[1], data)).astype("int64")
         y_data = np.expand_dims(y_data, axis=1)
 
-        tensor_x = core.LoDTensor()
+        tensor_x = fluid.LoDTensor()
         tensor_x.set(x_data, place)
 
-        tensor_y = core.LoDTensor()
+        tensor_y = fluid.LoDTensor()
         tensor_y.set(y_data, place)
 
-        outs = exe.run(framework.default_main_program(),
+        outs = exe.run(fluid.default_main_program(),
                        feed={'x': tensor_x,
                              'y': tensor_y},
-                       fetch_list=[avg_cost, accuracy])
+                       fetch_list=[avg_cost] + accuracy.metrics)
         out = np.array(outs[0])
         acc = np.array(outs[1])
-        if out[0] < 5.0:
-            exit(0)  # if avg cost less than 5.0, we think our code is good.
+        pass_acc = accuracy.eval(exe)
+
+        test_accuracy.reset(exe)
+        for data in test_reader():
+            x_data = np.array(map(lambda x: x[0], data)).astype("float32")
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = np.expand_dims(y_data, axis=1)
+
+            out, acc = exe.run(inference_program,
+                               feed={'x': x_data,
+                                     'y': y_data},
+                               fetch_list=[avg_cost] + test_accuracy.metrics)
+
+        test_pass_acc = test_accuracy.eval(exe)
+        print("pass_id=" + str(pass_id) + " train_cost=" + str(
+            out) + " train_acc=" + str(acc) + " train_pass_acc=" + str(pass_acc)
+              + " test_acc=" + str(test_pass_acc))
+
+        if test_pass_acc > 0.7:
+            exit(0)
 exit(1)
diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
index eefcb55bebff41eb9c67d9f0c8e83a5f1d4599bd..db91ca4f9c7d17fb51fc5d65a0464e976d98523c 100644
--- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
@@ -1,12 +1,11 @@
+import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
-import paddle.v2.fluid.nets as nets
 import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
 import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.executor import Executor
-
-import numpy as np
+from paddle.v2.fluid.optimizer import SGDOptimizer
 
 IS_SPARSE = True
 USE_GPU = False
@@ -19,74 +18,55 @@ def get_usr_combined_features():
 
     USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
 
-    uid = layers.data(
-        name='user_id',
-        shape=[1],
-        data_type='int64')
+    uid = layers.data(name='user_id', shape=[1], dtype='int64')
 
     usr_emb = layers.embedding(
         input=uid,
-        data_type='float32',
+        dtype='float32',
         size=[USR_DICT_SIZE, 32],
-        param_attr={'name': 'user_table'},
+        param_attr='user_table',
         is_sparse=IS_SPARSE)
 
-    usr_fc = layers.fc(input=usr_emb,
-                       size=32)
+    usr_fc = layers.fc(input=usr_emb, size=32)
 
     USR_GENDER_DICT_SIZE = 2
 
-    usr_gender_id = layers.data(
-        name='gender_id',
-        shape=[1],
-        data_type='int64')
+    usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
 
     usr_gender_emb = layers.embedding(
         input=usr_gender_id,
         size=[USR_GENDER_DICT_SIZE, 16],
-        param_attr={'name': 'gender_table'},
+        param_attr='gender_table',
         is_sparse=IS_SPARSE)
 
-    usr_gender_fc = layers.fc(input=usr_gender_emb,
-                              size=16)
+    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
 
     USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
-    usr_age_id = layers.data(
-        name='age_id',
-        shape=[1],
-        data_type="int64")
+    usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")
 
     usr_age_emb = layers.embedding(
         input=usr_age_id,
         size=[USR_AGE_DICT_SIZE, 16],
         is_sparse=IS_SPARSE,
-        param_attr={'name': 'age_table'})
+        param_attr='age_table')
 
-    usr_age_fc = layers.fc(input=usr_age_emb,
-                           size=16)
+    usr_age_fc = layers.fc(input=usr_age_emb, size=16)
 
     USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
-    usr_job_id = layers.data(
-        name='job_id',
-        shape=[1],
-        data_type="int64")
+    usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")
 
     usr_job_emb = layers.embedding(
         input=usr_job_id,
         size=[USR_JOB_DICT_SIZE, 16],
-        param_attr={'name': 'job_table'},
+        param_attr='job_table',
         is_sparse=IS_SPARSE)
 
-    usr_job_fc = layers.fc(input=usr_job_emb,
-                           size=16)
+    usr_job_fc = layers.fc(input=usr_job_emb, size=16)
 
     concat_embed = layers.concat(
-        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc],
-        axis=1)
+        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
 
-    usr_combined_features = layers.fc(input=concat_embed,
-                                      size=200,
-                                      act="tanh")
+    usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
 
     return usr_combined_features
 
@@ -95,48 +75,33 @@ def get_mov_combined_features():
 
     MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
 
-    mov_id = layers.data(
-        name='movie_id',
-        shape=[1],
-        data_type='int64')
+    mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')
 
     mov_emb = layers.embedding(
         input=mov_id,
-        data_type='float32',
+        dtype='float32',
         size=[MOV_DICT_SIZE, 32],
-        param_attr={'name': 'movie_table'},
+        param_attr='movie_table',
         is_sparse=IS_SPARSE)
 
-    mov_fc = layers.fc(input=mov_emb,
-                       size=32)
+    mov_fc = layers.fc(input=mov_emb, size=32)
 
     CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
 
-    category_id = layers.data(
-        name='category_id',
-        shape=[1],
-        data_type='int64')
+    category_id = layers.data(name='category_id', shape=[1], dtype='int64')
 
     mov_categories_emb = layers.embedding(
-        input=category_id,
-        size=[CATEGORY_DICT_SIZE, 32],
-        is_sparse=IS_SPARSE)
+        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
 
     mov_categories_hidden = layers.sequence_pool(
-        input=mov_categories_emb,
-        pool_type="sum")
+        input=mov_categories_emb, pool_type="sum")
 
     MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
 
-    mov_title_id = layers.data(
-        name='movie_title',
-        shape=[1],
-        data_type='int64')
+    mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64')
 
     mov_title_emb = layers.embedding(
-        input=mov_title_id,
-        size=[MOV_TITLE_DICT_SIZE, 32],
-        is_sparse=IS_SPARSE)
+        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
 
     mov_title_conv = nets.sequence_conv_pool(
         input=mov_title_emb,
@@ -146,13 +111,10 @@ def get_mov_combined_features():
         pool_type="sum")
 
     concat_embed = layers.concat(
-        input=[mov_fc, mov_categories_hidden, mov_title_conv],
-        axis=1)
+        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
 
     # FIXME(dzh) : need tanh operator
-    mov_combined_features = layers.fc(input=concat_embed,
-                                      size=200,
-                                      act="tanh")
+    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
 
     return mov_combined_features
 
@@ -162,18 +124,11 @@ def model():
     mov_combined_features = get_mov_combined_features()
 
     # need cos sim
-    inference = layers.cos_sim(
-        X=usr_combined_features,
-        Y=mov_combined_features)
+    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
 
-    label = layers.data(
-        name='score',
-        shape=[1],
-        data_type='float32')
+    label = layers.data(name='score', shape=[1], dtype='float32')
 
-    square_cost = layers.square_error_cost(
-        input=inference,
-        label=label)
+    square_cost = layers.square_error_cost(input=inference, label=label)
 
     avg_cost = layers.mean(x=square_cost)
 
@@ -182,7 +137,7 @@ def model():
 
 def main():
     cost = model()
-    sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.2)
+    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
     opts = sgd_optimizer.minimize(cost)
 
     if USE_GPU:
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
index 91fc79a9870a31205098d8a40de6c033d5bf60b9..be875a952b7086ee64984525d70ffd3f1ecb5fae 100644
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
@@ -1,40 +1,35 @@
-import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
-import paddle.v2.fluid.nets as nets
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
-import paddle.v2.fluid.framework as framework
-from paddle.v2.fluid.executor import Executor
-
+from __future__ import print_function
 import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
 
 
 def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32):
-    data = layers.data(name="words", shape=[1], data_type="int64")
-    label = layers.data(name="label", shape=[1], data_type="int64")
+    data = fluid.layers.data(name="words", shape=[1], dtype="int64")
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
 
-    emb = layers.embedding(input=data, size=[input_dim, emb_dim])
-    conv_3 = nets.sequence_conv_pool(
+    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
+    conv_3 = fluid.nets.sequence_conv_pool(
         input=emb,
         num_filters=hid_dim,
         filter_size=3,
         act="tanh",
         pool_type="sqrt")
-    conv_4 = nets.sequence_conv_pool(
+    conv_4 = fluid.nets.sequence_conv_pool(
         input=emb,
         num_filters=hid_dim,
         filter_size=4,
         act="tanh",
         pool_type="sqrt")
-    prediction = layers.fc(input=[conv_3, conv_4],
-                           size=class_dim,
-                           act="softmax")
-    cost = layers.cross_entropy(input=prediction, label=label)
-    avg_cost = layers.mean(x=cost)
-    adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002)
-    opts = adam_optimizer.minimize(avg_cost)
-    acc = layers.accuracy(input=prediction, label=label)
-    return avg_cost, acc
+    prediction = fluid.layers.fc(input=[conv_3, conv_4],
+                                 size=class_dim,
+                                 act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
+    adam_optimizer.minimize(avg_cost)
+    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
+    return avg_cost, accuracy, accuracy.metrics[0]
 
 
 def to_lodtensor(data, place):
@@ -46,7 +41,7 @@ def to_lodtensor(data, place):
         lod.append(cur_len)
     flattened_data = np.concatenate(data, axis=0).astype("int64")
     flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = core.LoDTensor()
+    res = fluid.LoDTensor()
     res.set(flattened_data, place)
     res.set_lod([lod])
     return res
@@ -60,36 +55,38 @@ def main():
     dict_dim = len(word_dict)
     class_dim = 2
 
-    cost, acc = convolution_net(input_dim=dict_dim, class_dim=class_dim)
+    cost, accuracy, acc_out = convolution_net(
+        input_dim=dict_dim, class_dim=class_dim)
 
     train_data = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.imdb.train(word_dict), buf_size=1000),
         batch_size=BATCH_SIZE)
-    place = core.CPUPlace()
-    exe = Executor(place)
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
 
-    exe.run(framework.default_startup_program())
+    exe.run(fluid.default_startup_program())
 
     for pass_id in xrange(PASS_NUM):
+        accuracy.reset(exe)
         for data in train_data():
             tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
 
             label = np.array(map(lambda x: x[1], data)).astype("int64")
             label = label.reshape([BATCH_SIZE, 1])
 
-            tensor_label = core.LoDTensor()
+            tensor_label = fluid.LoDTensor()
             tensor_label.set(label, place)
 
-            outs = exe.run(framework.default_main_program(),
-                           feed={"words": tensor_words,
-                                 "label": tensor_label},
-                           fetch_list=[cost, acc])
-            cost_val = np.array(outs[0])
-            acc_val = np.array(outs[1])
-
-            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
-            if cost_val < 1.0 and acc_val > 0.7:
+            cost_val, acc_val = exe.run(
+                fluid.default_main_program(),
+                feed={"words": tensor_words,
+                      "label": tensor_label},
+                fetch_list=[cost, acc_out])
+            pass_acc = accuracy.eval(exe)
+            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
+                  " pass_acc=" + str(pass_acc))
+            if cost_val < 1.0 and pass_acc > 0.8:
                 exit(0)
     exit(1)
 
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
index 8c3d4488354eb363cd1d378ebd4cb8069e7c1b1d..094a3cdcda12eaee351476e99a388c44b3c81cd6 100644
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
@@ -1,12 +1,6 @@
-import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
-import paddle.v2.fluid.nets as nets
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
-import paddle.v2.fluid.framework as framework
-from paddle.v2.fluid.executor import Executor
-
 import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
 
 
 def stacked_lstm_net(input_dim,
@@ -15,36 +9,36 @@ def stacked_lstm_net(input_dim,
                      hid_dim=512,
                      stacked_num=3):
     assert stacked_num % 2 == 1
-    data = layers.data(name="words", shape=[1], data_type="int64")
-    label = layers.data(name="label", shape=[1], data_type="int64")
+    data = fluid.layers.data(name="words", shape=[1], dtype="int64")
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
 
-    emb = layers.embedding(input=data, size=[input_dim, emb_dim])
+    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
     # add bias attr
 
     # TODO(qijun) linear act
-    fc1 = layers.fc(input=emb, size=hid_dim)
-    lstm1, cell1 = layers.dynamic_lstm(input=fc1, size=hid_dim)
+    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
+    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
 
     inputs = [fc1, lstm1]
 
     for i in range(2, stacked_num + 1):
-        fc = layers.fc(input=inputs, size=hid_dim)
-        lstm, cell = layers.dynamic_lstm(
+        fc = fluid.layers.fc(input=inputs, size=hid_dim)
+        lstm, cell = fluid.layers.dynamic_lstm(
             input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
         inputs = [fc, lstm]
 
-    fc_last = layers.sequence_pool(input=inputs[0], pool_type='max')
-    lstm_last = layers.sequence_pool(input=inputs[1], pool_type='max')
+    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
+    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
 
-    prediction = layers.fc(input=[fc_last, lstm_last],
-                           size=class_dim,
-                           act='softmax')
-    cost = layers.cross_entropy(input=prediction, label=label)
-    avg_cost = layers.mean(x=cost)
-    adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002)
-    opts = adam_optimizer.minimize(avg_cost)
-    acc = layers.accuracy(input=prediction, label=label)
-    return avg_cost, acc
+    prediction = fluid.layers.fc(input=[fc_last, lstm_last],
+                                 size=class_dim,
+                                 act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
+    adam_optimizer.minimize(avg_cost)
+    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
+    return avg_cost, accuracy, accuracy.metrics[0]
 
 
 def to_lodtensor(data, place):
@@ -56,7 +50,7 @@ def to_lodtensor(data, place):
         lod.append(cur_len)
     flattened_data = np.concatenate(data, axis=0).astype("int64")
     flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = core.LoDTensor()
+    res = fluid.LoDTensor()
     res.set(flattened_data, place)
     res.set_lod([lod])
     return res
@@ -71,36 +65,38 @@ def main():
     dict_dim = len(word_dict)
     class_dim = 2
 
-    cost, acc = stacked_lstm_net(input_dim=dict_dim, class_dim=class_dim)
+    cost, accuracy, acc_out = stacked_lstm_net(
+        input_dim=dict_dim, class_dim=class_dim)
 
     train_data = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.imdb.train(word_dict), buf_size=1000),
         batch_size=BATCH_SIZE)
-    place = core.CPUPlace()
-    exe = Executor(place)
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
 
-    exe.run(framework.default_startup_program())
+    exe.run(fluid.default_startup_program())
 
     for pass_id in xrange(PASS_NUM):
+        accuracy.reset(exe)
         for data in train_data():
             tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
 
             label = np.array(map(lambda x: x[1], data)).astype("int64")
             label = label.reshape([BATCH_SIZE, 1])
 
-            tensor_label = core.LoDTensor()
+            tensor_label = fluid.LoDTensor()
             tensor_label.set(label, place)
 
-            outs = exe.run(framework.default_main_program(),
-                           feed={"words": tensor_words,
-                                 "label": tensor_label},
-                           fetch_list=[cost, acc])
-            cost_val = np.array(outs[0])
-            acc_val = np.array(outs[1])
-
-            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
-            if cost_val < 1.0 and acc_val > 0.7:
+            cost_val, acc_val = exe.run(
+                fluid.default_main_program(),
+                feed={"words": tensor_words,
+                      "label": tensor_label},
+                fetch_list=[cost, acc_out])
+            pass_acc = accuracy.eval(exe)
+            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
+                  " pass_acc=" + str(pass_acc))
+            if cost_val < 1.0 and acc_val > 0.8:
                 exit(0)
     exit(1)
 
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
index a7d791c1f38d4843f084127e879d613b21ae8daf..b2479320330bde5771c3d4a8e2923b5ab1eecf2e 100644
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
@@ -1,41 +1,39 @@
-import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
-import paddle.v2.fluid.framework as framework
-from paddle.v2.fluid.executor import Executor
-
 import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
 
 
 def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
-    data = layers.data(
+    data = fluid.layers.data(
         name="words",
         shape=[seq_len * batch_size, 1],
         append_batch_size=False,
-        data_type="int64")
-    label = layers.data(
+        dtype="int64")
+    label = fluid.layers.data(
         name="label",
         shape=[batch_size, 1],
         append_batch_size=False,
-        data_type="int64")
+        dtype="int64")
 
-    emb = layers.embedding(input=data, size=[dict_dim, emb_dim])
-    emb = layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim])
-    emb = layers.transpose(x=emb, axis=[1, 0, 2])
+    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
+    emb = fluid.layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim])
+    emb = fluid.layers.transpose(x=emb, axis=[1, 0, 2])
 
-    c_pre_init = layers.fill_constant(
-        dtype=emb.data_type, shape=[batch_size, emb_dim], value=0.0)
-    layer_1_out = layers.lstm(emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
-    layer_1_out = layers.transpose(x=layer_1_out, axis=[1, 0, 2])
+    c_pre_init = fluid.layers.fill_constant(
+        dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0)
+    layer_1_out = fluid.layers.lstm(
+        emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
+    layer_1_out = fluid.layers.transpose(x=layer_1_out, axis=[1, 0, 2])
 
-    prediction = layers.fc(input=layer_1_out, size=class_dim, act="softmax")
-    cost = layers.cross_entropy(input=prediction, label=label)
+    prediction = fluid.layers.fc(input=layer_1_out,
+                                 size=class_dim,
+                                 act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
 
-    avg_cost = layers.mean(x=cost)
-    adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002)
-    opts = adam_optimizer.minimize(avg_cost)
-    acc = layers.accuracy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
+    adam_optimizer.minimize(avg_cost)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
 
     return avg_cost, acc
 
@@ -49,57 +47,65 @@ def to_lodtensor(data, place):
         lod.append(cur_len)
     flattened_data = np.concatenate(data, axis=0).astype("int64")
     flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = core.LoDTensor()
+    res = fluid.LoDTensor()
     res.set(flattened_data, place)
     res.set_lod([lod])
     return res
 
 
-def chop_data(data, chop_len=80, batch_len=50):
+def chop_data(data, chop_len=80, batch_size=50):
     data = [(x[0][:chop_len], x[1]) for x in data if len(x[0]) >= chop_len]
 
-    return data[:batch_len]
+    return data[:batch_size]
 
 
 def prepare_feed_data(data, place):
     tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
 
     label = np.array(map(lambda x: x[1], data)).astype("int64")
-    label = label.reshape([50, 1])
-    tensor_label = core.LoDTensor()
+    label = label.reshape([len(label), 1])
+    tensor_label = fluid.LoDTensor()
     tensor_label.set(label, place)
 
     return tensor_words, tensor_label
 
 
 def main():
+    BATCH_SIZE = 100
+    PASS_NUM = 5
+
     word_dict = paddle.dataset.imdb.word_dict()
-    cost, acc = lstm_net(dict_dim=len(word_dict), class_dim=2)
+    print "load word dict successfully"
+    dict_dim = len(word_dict)
+    class_dim = 2
+
+    cost, acc = lstm_net(dict_dim=dict_dim, class_dim=class_dim)
 
-    batch_size = 100
     train_data = paddle.batch(
-        paddle.reader.buffered(
-            paddle.dataset.imdb.train(word_dict), size=batch_size * 10),
-        batch_size=batch_size)
-
-    data = chop_data(next(train_data()))
-
-    place = core.CPUPlace()
-    tensor_words, tensor_label = prepare_feed_data(data, place)
-    exe = Executor(place)
-    exe.run(framework.default_startup_program())
-
-    while True:
-        outs = exe.run(framework.default_main_program(),
-                       feed={"words": tensor_words,
-                             "label": tensor_label},
-                       fetch_list=[cost, acc])
-        cost_val = np.array(outs[0])
-        acc_val = np.array(outs[1])
-
-        print("cost=" + str(cost_val) + " acc=" + str(acc_val))
-        if acc_val > 0.9:
-            break
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=BATCH_SIZE * 10),
+        batch_size=BATCH_SIZE)
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in xrange(PASS_NUM):
+        for data in train_data():
+            chopped_data = chop_data(data)
+            tensor_words, tensor_label = prepare_feed_data(chopped_data, place)
+
+            outs = exe.run(fluid.default_main_program(),
+                           feed={"words": tensor_words,
+                                 "label": tensor_label},
+                           fetch_list=[cost, acc])
+            cost_val = np.array(outs[0])
+            acc_val = np.array(outs[1])
+
+            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
+            if acc_val > 0.7:
+                exit(0)
+    exit(1)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py
index 9dcb6f2fea06ea8cd061be4f148854408779f990..92d3629d42613e896e93e0149928b50940058169 100644
--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
@@ -1,11 +1,6 @@
-import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
-import paddle.v2.fluid.framework as framework
-from paddle.v2.fluid.executor import Executor
-
 import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
 
 PASS_NUM = 100
 EMBED_SIZE = 32
@@ -17,79 +12,57 @@ IS_SPARSE = True
 word_dict = paddle.dataset.imikolov.build_dict()
 dict_size = len(word_dict)
 
-first_word = layers.data(
-    name='firstw',
-    shape=[1],
-    data_type='int64')
-second_word = layers.data(
-    name='secondw',
-    shape=[1],
-    data_type='int64')
-third_word = layers.data(
-    name='thirdw',
-    shape=[1],
-    data_type='int64')
-forth_word = layers.data(
-    name='forthw',
-    shape=[1],
-    data_type='int64')
-next_word = layers.data(
-    name='nextw',
-    shape=[1],
-    data_type='int64')
+first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
+second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
+third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
+forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
+next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
 
-embed_first = layers.embedding(
+embed_first = fluid.layers.embedding(
     input=first_word,
     size=[dict_size, EMBED_SIZE],
-    data_type='float32',
+    dtype='float32',
     is_sparse=IS_SPARSE,
-    param_attr={'name': 'shared_w'})
-embed_second = layers.embedding(
+    param_attr='shared_w')
+embed_second = fluid.layers.embedding(
     input=second_word,
     size=[dict_size, EMBED_SIZE],
-    data_type='float32',
+    dtype='float32',
     is_sparse=IS_SPARSE,
-    param_attr={'name': 'shared_w'})
-embed_third = layers.embedding(
+    param_attr='shared_w')
+embed_third = fluid.layers.embedding(
     input=third_word,
     size=[dict_size, EMBED_SIZE],
-    data_type='float32',
+    dtype='float32',
     is_sparse=IS_SPARSE,
-    param_attr={'name': 'shared_w'})
-embed_forth = layers.embedding(
+    param_attr='shared_w')
+embed_forth = fluid.layers.embedding(
     input=forth_word,
     size=[dict_size, EMBED_SIZE],
-    data_type='float32',
+    dtype='float32',
     is_sparse=IS_SPARSE,
-    param_attr={'name': 'shared_w'})
+    param_attr='shared_w')
 
-concat_embed = layers.concat(
-    input=[embed_first, embed_second, embed_third, embed_forth],
-    axis=1)
-hidden1 = layers.fc(input=concat_embed,
-                    size=HIDDEN_SIZE,
-                    act='sigmoid')
-predict_word = layers.fc(input=hidden1,
-                         size=dict_size,
-                         act='softmax')
-cost = layers.cross_entropy(
-    input=predict_word,
-    label=next_word)
-avg_cost = layers.mean(x=cost)
-sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost)
+concat_embed = fluid.layers.concat(
+    input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
+hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid')
+predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
+cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
+avg_cost = fluid.layers.mean(x=cost)
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
 
 train_reader = paddle.batch(
     paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
 
-place = core.CPUPlace()
-exe = Executor(place)
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
 
 # fix https://github.com/PaddlePaddle/Paddle/issues/5434 then remove
 # below exit line.
 exit(0)
 
-exe.run(framework.default_startup_program())
+exe.run(fluid.default_startup_program())
 
 for pass_id in range(PASS_NUM):
     for data in train_reader():
@@ -97,36 +70,15 @@ for pass_id in range(PASS_NUM):
         input_data = map(lambda x: np.array(x).astype("int64"), input_data)
         input_data = map(lambda x: np.expand_dims(x, axis=1), input_data)
 
-        first_data = input_data[0]
-        first_tensor = core.LoDTensor()
-        first_tensor.set(first_data, place)
-
-        second_data = input_data[1]
-        second_tensor = core.LoDTensor()
-        second_tensor.set(second_data, place)
-
-        third_data = input_data[2]
-        third_tensor = core.LoDTensor()
-        third_tensor.set(third_data, place)
-
-        forth_data = input_data[3]
-        forth_tensor = core.LoDTensor()
-        forth_tensor.set(forth_data, place)
-
-        next_data = input_data[4]
-        next_tensor = core.LoDTensor()
-        next_tensor.set(next_data, place)
-
-        outs = exe.run(framework.default_main_program(),
-                       feed={
-                           'firstw': first_tensor,
-                           'secondw': second_tensor,
-                           'thirdw': third_tensor,
-                           'forthw': forth_tensor,
-                           'nextw': next_tensor
-                       },
-                       fetch_list=[avg_cost])
-        out = np.array(outs[0])
-        if out[0] < 10.0:
+        avg_cost_np = exe.run(fluid.default_main_program(),
+                              feed={
+                                  'firstw': input_data[0],
+                                  'secondw': input_data[1],
+                                  'thirdw': input_data[2],
+                                  'forthw': input_data[3],
+                                  'nextw': input_data[4]
+                              },
+                              fetch_list=[avg_cost])
+        if avg_cost_np[0] < 10.0:
             exit(0)  # if avg cost less than 10.0, we think our code is good.
 exit(1)
diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py
index 90269e308a31d2606b23d741ce0d0fa91a0a6aeb..e83c4a0622013cbfebdf39434ef252412697acb1 100644
--- a/python/paddle/v2/fluid/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
@@ -261,7 +261,10 @@ class OpTest(unittest.TestCase):
         feed_map = self.feed_var(inputs, place)
 
         exe = Executor(place)
-        outs = exe.run(program, feed=feed_map, fetch_list=fetch_list)
+        outs = exe.run(program,
+                       feed=feed_map,
+                       fetch_list=fetch_list,
+                       return_numpy=False)
 
         for out_name, out_dup in Operator.get_op_outputs(self.op_type):
             if out_name not in self.outputs:
@@ -458,7 +461,7 @@ class OpTest(unittest.TestCase):
         mean_inputs = map(block.var, output_names)
 
         if len(mean_inputs) == 1:
-            loss = block.create_var(dtype=mean_inputs[0].data_type, shape=[1])
+            loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
             op = block.append_op(
                 inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
             op.desc.infer_var_type(block.desc)
@@ -466,8 +469,7 @@ class OpTest(unittest.TestCase):
         else:
             avg_sum = []
             for cur_loss in mean_inputs:
-                cur_avg_loss = block.create_var(
-                    dtype=cur_loss.data_type, shape=[1])
+                cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
                 op = block.append_op(
                     inputs={"X": [cur_loss]},
                     outputs={"Out": [cur_avg_loss]},
@@ -476,13 +478,13 @@ class OpTest(unittest.TestCase):
                 op.desc.infer_shape(block.desc)
                 avg_sum.append(cur_avg_loss)
 
-            loss_sum = block.create_var(dtype=avg_sum[0].data_type, shape=[1])
+            loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
             op_sum = block.append_op(
                 inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
             op_sum.desc.infer_var_type(block.desc)
             op_sum.desc.infer_shape(block.desc)
 
-            loss = block.create_var(dtype=loss_sum.data_type, shape=[1])
+            loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
             op_loss = block.append_op(
                 inputs={"X": loss_sum},
                 outputs={"Out": loss},
@@ -501,5 +503,6 @@ class OpTest(unittest.TestCase):
 
         fetch_list = [g for p, g in param_grad_list]
         executor = Executor(place)
-        result = executor.run(prog, feed_dict, fetch_list)
-        return map(np.array, result)
+        return map(
+            np.array,
+            executor.run(prog, feed_dict, fetch_list, return_numpy=False))
diff --git a/python/paddle/v2/fluid/tests/test_activation_op.py b/python/paddle/v2/fluid/tests/test_activation_op.py
index 7649e60a3833e34523d87cb963af3888c3cef65d..bd52bef2605874d26e880fb09e589891fc1934d5 100644
--- a/python/paddle/v2/fluid/tests/test_activation_op.py
+++ b/python/paddle/v2/fluid/tests/test_activation_op.py
@@ -152,6 +152,49 @@ class TestAbs(OpTest):
         self.check_grad(['X'], 'Y', max_relative_error=0.007)
 
 
+class TestCeil(OpTest):
+    def setUp(self):
+        self.op_type = "ceil"
+        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.inputs = {'X': x}
+        self.outputs = {'Y': np.ceil(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+
+
+class TestFloor(OpTest):
+    def setUp(self):
+        self.op_type = "floor"
+        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.inputs = {'X': x}
+        # numpy floor need +1
+        self.outputs = {'Y': np.floor(self.inputs['X']) + 1.0}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+
+
+class TestRound(OpTest):
+    def setUp(self):
+        self.op_type = "round"
+        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.inputs = {'X': x}
+        self.outputs = {'Y': np.round(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+
+
 class TestRelu(OpTest):
     def setUp(self):
         self.op_type = "relu"
diff --git a/python/paddle/v2/fluid/tests/test_array_read_write_op.py b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
index e019a4e15f0e25deaedf30911b44e576c8f89013..f6120aedecf1015c279b8f218f5e37f2e598ab91 100644
--- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
@@ -3,7 +3,7 @@ import paddle.v2.fluid.core as core
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.backward import append_backward_ops
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.framework import default_main_program
 import numpy
 
 
@@ -52,15 +52,13 @@ class TestArrayReadWrite(unittest.TestCase):
 
         exe = Executor(cpu)
 
-        tensor = core.LoDTensor()
-        tensor.set(numpy.random.random(size=(100, 100)).astype('float32'), cpu)
+        tensor = numpy.random.random(size=(100, 100)).astype('float32')
 
-        outs = map(numpy.array,
-                   exe.run(feed={'x0': tensor,
-                                 'x1': tensor,
-                                 'x2': tensor},
-                           fetch_list=[a_sum, x_sum],
-                           scope=scope))
+        outs = exe.run(feed={'x0': tensor,
+                             'x1': tensor,
+                             'x2': tensor},
+                       fetch_list=[a_sum, x_sum],
+                       scope=scope)
         self.assertEqual(outs[0], outs[1])
 
         total_sum = layers.sums(input=[a_sum, x_sum])
@@ -68,16 +66,15 @@ class TestArrayReadWrite(unittest.TestCase):
 
         append_backward_ops(total_sum_scaled)
 
-        g_vars = map(g_main_program.global_block().var,
+        g_vars = map(default_main_program().global_block().var,
                      [each_x.name + "@GRAD" for each_x in x])
         g_out = [
             item.sum()
-            for item in map(
-                numpy.array,
-                exe.run(feed={'x0': tensor,
-                              'x1': tensor,
-                              'x2': tensor},
-                        fetch_list=g_vars))
+            for item in exe.run(
+                feed={'x0': tensor,
+                      'x1': tensor,
+                      'x2': tensor},
+                fetch_list=g_vars)
         ]
         g_out_sum = numpy.array(g_out).sum()
 
diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
index 71f9599e0de83c86808f7e62547f80d3d50ffc7d..e766a68c0e338b07e47260e40edc544c98555382 100644
--- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
@@ -21,6 +21,13 @@ def get_backward_op(scope, op, no_grad_set):
 
 
 def _reference_training(x, scale, offset, epsilon, data_format):
+    x_shape = x.shape
+    if len(x_shape) == 2:
+        if data_format == "NCHW":
+            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
+        else:
+            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
+
     if data_format == "NCHW":
         n, c, h, w = x.shape
         x_square = x * x
@@ -39,6 +46,8 @@ def _reference_training(x, scale, offset, epsilon, data_format):
         offset_tile = np.reshape(offset, (1, c, 1, 1))
         offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
         y = normalized * scale_tile + offset_tile
+        if len(x_shape) == 2:
+            y = np.reshape(y, (y.shape[0], y.shape[1]))
         return y, mean, var
     elif data_format == "NHWC":
         x_square = x * x
@@ -48,7 +57,10 @@ def _reference_training(x, scale, offset, epsilon, data_format):
         mean = x_sum / element_count
         var = x_square_sum / element_count - mean * mean
         normalized = (x - mean) / np.sqrt(var + epsilon)
-        return (normalized * scale + offset), mean, var
+        y = normalized * scale + offset
+        if len(x_shape) == 2:
+            y = np.reshape(y, x_shape)
+        return y, mean, var
     else:
         raise ValueError("Unknown data order.")
 
@@ -65,6 +77,18 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
     #   (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
 
     # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
+    x_shape = x.shape
+
+    if len(x_shape) == 2:
+        if data_format == "NCHW":
+            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
+            grad_y = np.reshape(grad_y,
+                                (grad_y.shape[0], grad_y.shape[1], 1, 1))
+        else:
+            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
+            grad_y = np.reshape(grad_y,
+                                (grad_y.shape[0], 1, 1, grad_y.shape[1]))
+
     if data_format == "NCHW":
         x = np.transpose(x, (0, 2, 3, 1))
         grad_y = np.transpose(grad_y, (0, 2, 3, 1))
@@ -83,6 +107,9 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
         grad_x = np.transpose(grad_x, (0, 3, 1, 2))
         x = np.transpose(x, (0, 3, 1, 2))
         grad_y = np.transpose(grad_y, (0, 3, 1, 2))
+
+    if len(x_shape) == 2:
+        grad_x = np.reshape(grad_x, x_shape)
     return grad_x, grad_scale, grad_offset
 
 
@@ -127,7 +154,7 @@ class TestBatchNormOp(OpTest):
         momentum = 0.9
 
         # N, H, W, C: 2, 3, 4, 2
-        n, h, w, c = 2, 3, 4, 2
+        n, h, w, c = 2, 3, 4, 5
         x_shape = [n, h, w, c]
         scale_shape = [c]
 
@@ -184,20 +211,23 @@ class TestBatchNormOp(OpTest):
         print 'python: NHWC, NCHW, backward checking passed'
 
     def test_forward_backward(self):
-        def test_with_place(place, tensor_format):
+        def test_with_place(place, tensor_format, shape):
             # attr
             epsilon = 0.00001
             momentum = 0.9
 
-            # N, H, W, C: 12, 3, 4, 2
-            n, h, w, c = 2, 3, 4, 2
-
-            if data_format == "NHWC":
-                x_shape = [n, h, w, c]
-            elif data_format == "NCHW":
-                x_shape = [n, c, h, w]
+            if len(shape) == 2:
+                x_shape = shape
+                c = shape[1]
             else:
-                raise ValueError("Unknown data type.")
+                # n, h, w, c = 2, 3, 4, 2
+                n, h, w, c = shape[0], shape[1], shape[2], shape[3]
+                if data_format == "NHWC":
+                    x_shape = [n, h, w, c]
+                elif data_format == "NCHW":
+                    x_shape = [n, c, h, w]
+                else:
+                    raise ValueError("Unknown data type.")
             scale_shape = [c]
 
             x_val = np.random.random_sample(x_shape).astype(np.float32)
@@ -219,7 +249,10 @@ class TestBatchNormOp(OpTest):
             #  for gradient test
             # y_grad = np.ones(x_shape).astype(np.float32)
             y_grad = np.zeros(x_shape).astype(np.float32)
-            y_grad[0, 0, 0, 0] = 1.
+            if len(y_grad.shape) == 2:
+                y_grad[0, 0] = 1.
+            else:
+                y_grad[0, 0, 0, 0] = 1.
             # y_grad = np.random.random_sample(x_shape).astype(np.float32)
             x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
                 x_val, y_grad, scale_val, saved_mean, var_ref, epsilon,
@@ -313,7 +346,8 @@ class TestBatchNormOp(OpTest):
             places.append(core.GPUPlace(0))
         for place in places:
             for data_format in ["NCHW", "NHWC"]:
-                test_with_place(place, data_format)
+                test_with_place(place, data_format, [2, 3, 4, 5])
+                test_with_place(place, data_format, [2, 3])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
index 8a11820d2aba2dd4d17d925f0e0fe9f324100418..5fad7d8cce5af3677aa77dc0abb64f1ecd380419 100644
--- a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
+++ b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
@@ -35,15 +35,15 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
         self.append_lod_tensor(
             scores, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
             np.array(
-                [1, 2, 3, 4, 5, 6], dtype="float32"))
+                [1, 2, 3, 4, 5, 6], dtype="float64"))
         self.append_lod_tensor(
             scores, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
             np.array(
-                [0, 1, 2, 3, 4, 5], dtype="float32"))
+                [0, 1, 2, 3, 4, 5], dtype="float64"))
         self.append_lod_tensor(
             scores, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
             np.array(
-                [0, 1, 2, 3, 4], dtype="float32"))
+                [0, 1, 2, 3, 4], dtype="float64"))
 
         sentence_ids = self.scope.var("sentence_ids").get_tensor()
         sentence_scores = self.scope.var("sentence_scores").get_tensor()
diff --git a/python/paddle/v2/fluid/tests/test_cast_op.py b/python/paddle/v2/fluid/tests/test_cast_op.py
index 0c4b6310652e84d3dd7f281a8b98ae0435072afb..4e431bb88da6070718d64a68467be20ca87f8fb9 100644
--- a/python/paddle/v2/fluid/tests/test_cast_op.py
+++ b/python/paddle/v2/fluid/tests/test_cast_op.py
@@ -10,8 +10,8 @@ class TestCastOp(op_test.OpTest):
         self.inputs = {'X': ipt.astype('float32')}
         self.outputs = {'Out': ipt.astype('float64')}
         self.attrs = {
-            'in_data_type': int(core.DataType.FP32),
-            'out_data_type': int(core.DataType.FP64)
+            'in_dtype': int(core.DataType.FP32),
+            'out_dtype': int(core.DataType.FP64)
         }
         self.op_type = 'cast'
 
diff --git a/python/paddle/v2/fluid/tests/test_conditional_block.py b/python/paddle/v2/fluid/tests/test_conditional_block.py
index 293803f004a1513611fba30634d5552e1da84fef..2b9d8f351a2836cd723d629d4790de1e068d0ea3 100644
--- a/python/paddle/v2/fluid/tests/test_conditional_block.py
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
@@ -1,7 +1,7 @@
 import unittest
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.core as core
-from paddle.v2.fluid.framework import g_startup_program, g_main_program
+from paddle.v2.fluid.framework import default_startup_program, default_main_program
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.backward import append_backward_ops
 import numpy
@@ -9,7 +9,7 @@ import numpy
 
 class ConditionalBlock(unittest.TestCase):
     def test_forward(self):
-        data = layers.data(name='X', shape=[1], data_type='float32')
+        data = layers.data(name='X', shape=[1], dtype='float32')
         data.stop_gradient = False
         cond = layers.ConditionalBlock(inputs=[data])
         out = layers.create_tensor(dtype='float32')
@@ -19,20 +19,19 @@ class ConditionalBlock(unittest.TestCase):
 
         cpu = core.CPUPlace()
         exe = Executor(cpu)
-        exe.run(g_startup_program)
+        exe.run(default_startup_program())
 
-        x = core.LoDTensor()
-        x.set(numpy.random.random(size=(10, 1)).astype('float32'), cpu)
+        x = numpy.random.random(size=(10, 1)).astype('float32')
 
-        outs = map(numpy.array, exe.run(feed={'X': x}, fetch_list=[out]))[0]
+        outs = exe.run(feed={'X': x}, fetch_list=[out])[0]
         print outs
         loss = layers.mean(x=out)
         append_backward_ops(loss=loss)
-        outs = map(numpy.array,
-                   exe.run(feed={'X': x},
-                           fetch_list=[
-                               g_main_program.block(0).var(data.name + "@GRAD")
-                           ]))[0]
+        outs = exe.run(
+            feed={'X': x},
+            fetch_list=[
+                default_main_program().block(0).var(data.name + "@GRAD")
+            ])[0]
         print outs
 
 
diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py
index 907b52c405d9e5c02c70f611e4c777ba21948c40..e82e3ab0c9c0bc75a13a8948fda925bc4f0b6512 100644
--- a/python/paddle/v2/fluid/tests/test_conv2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
@@ -16,8 +16,8 @@ def conv2d_forward_naive(input, filter, group, conv_param):
     out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) / stride[1]
     out = np.zeros((in_n, out_c, out_h, out_w))
 
-    d_bolck_w = (dilation[0] * (f_h - 1) + 1)
-    d_bolck_h = (dilation[1] * (f_w - 1) + 1)
+    d_bolck_h = (dilation[0] * (f_h - 1) + 1)
+    d_bolck_w = (dilation[1] * (f_w - 1) + 1)
 
     input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], )),
                        mode='constant',
@@ -110,13 +110,30 @@ class TestConv2dOp(OpTest):
         self.op_type = "conv2d"
 
 
+class TestWithPad(TestConv2dOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+
+class TestWithStride(TestConv2dOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 6, 6]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+
 class TestWithGroup(TestConv2dOp):
     def init_group(self):
         self.groups = 3
 
-    def init_op_type(self):
-        self.op_type = "conv2d"
-
 
 class TestWith1x1(TestConv2dOp):
     def init_test_case(self):
@@ -127,15 +144,9 @@ class TestWith1x1(TestConv2dOp):
         f_c = self.input_size[1] / self.groups
         self.filter_size = [6, f_c, 1, 1]
 
-    def init_dilation(self):
-        self.dilations = [1, 1]
-
     def init_group(self):
         self.groups = 3
 
-    def init_op_type(self):
-        self.op_type = "conv2d"
-
 
 class TestWithDilation(TestConv2dOp):
     def init_test_case(self):
@@ -152,26 +163,31 @@ class TestWithDilation(TestConv2dOp):
     def init_group(self):
         self.groups = 3
 
+
+#----------------Conv2dCudnn----------------
+class TestCudnn(TestConv2dOp):
     def init_op_type(self):
-        self.op_type = "conv2d"
+        self.op_type = "conv2d_cudnn"
 
 
-#----------------Conv2dCudnn----------------
+class TestCudnnWithPad(TestWithPad):
+    def init_op_type(self):
+        self.op_type = "conv2d_cudnn"
 
 
-class TestCudnn(TestConv2dOp):
+class TestCudnnWithStride(TestWithStride):
     def init_op_type(self):
-        self.op_type = "conv_cudnn"
+        self.op_type = "conv2d_cudnn"
 
 
 class TestCudnnWithGroup(TestWithGroup):
     def init_op_type(self):
-        self.op_type = "conv_cudnn"
+        self.op_type = "conv2d_cudnn"
 
 
 class TestCudnnWith1x1(TestWith1x1):
     def init_op_type(self):
-        self.op_type = "conv_cudnn"
+        self.op_type = "conv2d_cudnn"
 
 
 #  cudnn v5 does not support dilation conv.
diff --git a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
index 54349c018c4a53b8767d6cd4f94d99c719dc0237..d7b1f2f2a3abf6335998742dbbef8e17794170fa 100644
--- a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
@@ -4,9 +4,7 @@ from op_test import OpTest
 
 
 def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
-    # [2, 3, 5, 5]
     in_n, in_c, in_h, in_w = input_.shape
-    # [3, 6, 3, 3]
     f_c, out_c, f_h, f_w = filter_.shape
     assert in_c == f_c
 
@@ -29,6 +27,7 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
                     j1, j2 = j * stride[0], j * stride[0] + f_w
                     out[n, k, i1:i2, j1:j2] += tmp_out
 
+    out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]]
     return out
 
 
@@ -36,8 +35,6 @@ class TestConv2dTransposeOp(OpTest):
     def setUp(self):
         # init as conv transpose
         self.init_op_type()
-
-        # [2, 3, 5, 5] -> kernel [3, 6, 3, 3] -> output [2, 6, 7, 7]
         self.init_test_case()
 
         conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad}
@@ -55,7 +52,6 @@ class TestConv2dTransposeOp(OpTest):
         self.outputs = {'Output': output}
 
     def test_check_output(self):
-        print 'check output here for', self.op_type
         self.check_output()
 
     def test_check_grad_no_input(self):
@@ -88,6 +84,26 @@ class TestConv2dTransposeOp(OpTest):
         self.op_type = "conv2d_transpose"
 
 
+class TestWithPad(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+
+class TestWithStride(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+
 # ------------ test_cudnn ------------
 class TestCudnn(TestConv2dTransposeOp):
     def init_op_type(self):
diff --git a/python/paddle/v2/fluid/tests/test_conv3d_op.py b/python/paddle/v2/fluid/tests/test_conv3d_op.py
index 934ea46437d67b78309a86a2779e0c6577399136..8593dff20b5c283d5862206dfb0c0d2501039d07 100644
--- a/python/paddle/v2/fluid/tests/test_conv3d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv3d_op.py
@@ -169,5 +169,31 @@ class TestWithDilation(TestConv3dOp):
         self.groups = 3
 
 
+class TestCudnn(TestConv3dOp):
+    def init_op_type(self):
+        self.op_type = "conv3d_cudnn"
+
+
+class TestWithGroup1Cudnn(TestWithGroup1):
+    def init_op_type(self):
+        self.op_type = "conv3d_cudnn"
+
+
+class TestWithGroup2Cudnn(TestWithGroup2):
+    def init_op_type(self):
+        self.op_type = "conv3d_cudnn"
+
+
+class TestWith1x1Cudnn(TestWith1x1):
+    def init_op_type(self):
+        self.op_type = "conv3d_cudnn"
+
+
+# FIXME(typhoonzero): find a way to determine if
+# using cudnn > 6 in python
+# class TestWithDilationCudnn(TestWithDilation):
+#     def init_op_type(self):
+#         self.op_type = "conv3d_cudnn"
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
index 132fe7931438a30cf02e4ad2894c0838e48ffc9f..8fd34b87bfea91307f52fdcbb9f71f2e1a9c6c56 100644
--- a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
@@ -4,9 +4,7 @@ from op_test import OpTest
 
 
 def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
-    # [2, 3, 5, 5, 5]
     in_n, in_c, in_d, in_h, in_w = input_.shape
-    # [3, 6, 3, 3, 3]
     f_c, out_c, f_d, f_h, f_w = filter_.shape
     assert in_c == f_c
 
@@ -14,7 +12,6 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
     out_d = (in_d - 1) * stride[0] + f_d
     out_h = (in_h - 1) * stride[1] + f_h
     out_w = (in_w - 1) * stride[2] + f_w
-
     out = np.zeros((in_n, out_c, out_d, out_h, out_w))
 
     for n in range(in_n):
@@ -33,6 +30,8 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
                         j1, j2 = j * stride[2], j * stride[2] + f_w
                         out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out
 
+    out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w -
+              pad[2]]
     return out
 
 
@@ -40,8 +39,6 @@ class TestConv3dTransposeOp(OpTest):
     def setUp(self):
         # init as conv transpose
         self.init_op_type()
-
-        # [2, 3, 5, 5, 5] -> kernel [3, 6, 3, 3, 3] -> output [2, 6, 7, 7, 7]
         self.init_test_case()
 
         conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad}
@@ -49,7 +46,6 @@ class TestConv3dTransposeOp(OpTest):
         filter_ = np.random.random(self.filter_size).astype("float32")
         output = conv3dtranspose_forward_naive(
             input_, filter_, conv3dtranspose_param).astype("float32")
-        # print 'deconv output py', output, output.shape
 
         self.inputs = {'Input': input_, 'Filter': filter_}
         self.attrs = {
@@ -60,7 +56,6 @@ class TestConv3dTransposeOp(OpTest):
         self.outputs = {'Output': output}
 
     def test_check_output(self):
-        print 'check output here'
         self.check_output()
 
     def test_check_grad(self):
@@ -85,7 +80,7 @@ class TestConv3dTransposeOp(OpTest):
         self.pad = [0, 0, 0]
         self.stride = [1, 1, 1]
         self.dilations = [1, 1, 1]
-        self.input_size = [2, 3, 5, 5, 5]  # NCHW
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
 
@@ -93,5 +88,31 @@ class TestConv3dTransposeOp(OpTest):
         self.op_type = "conv3d_transpose"
 
 
+class TestWithPad(TestConv3dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+
+class TestWithStride(TestConv3dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [2, 2, 2]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+
+# ------------ test_cudnn ------------
+class TestCudnn(TestConv3dTransposeOp):
+    def init_op_type(self):
+        self.op_type = "conv3d_transpose_cudnn"
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_dropout_op.py b/python/paddle/v2/fluid/tests/test_dropout_op.py
index b14a366fcad7f4bf6968b6013c6cfbb57090071d..4f5ea836b44102e5599a2302efd669291ebe920b 100644
--- a/python/paddle/v2/fluid/tests/test_dropout_op.py
+++ b/python/paddle/v2/fluid/tests/test_dropout_op.py
@@ -7,7 +7,7 @@ class TestDropoutOp(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'is_training': True}
+        self.attrs = {'dropout_prob': 0.0, 'is_test': False}
         self.outputs = {
             'Out': self.inputs['X'],
             'Mask': np.ones((32, 64)).astype('float32')
@@ -24,7 +24,7 @@ class TestDropoutOp2(TestDropoutOp):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 1.0, 'is_training': True}
+        self.attrs = {'dropout_prob': 1.0, 'is_test': False}
         self.outputs = {
             'Out': np.zeros((32, 64)).astype('float32'),
             'Mask': np.zeros((32, 64)).astype('float32')
@@ -35,7 +35,7 @@ class TestDropoutOp3(TestDropoutOp):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'is_training': True}
+        self.attrs = {'dropout_prob': 0.0, 'is_test': False}
         self.outputs = {
             'Out': self.inputs['X'],
             'Mask': np.ones((32, 64, 2)).astype('float32')
@@ -46,7 +46,7 @@ class TestDropoutOp4(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.35, 'is_training': False}
+        self.attrs = {'dropout_prob': 0.35, 'is_test': True}
         self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']}
 
     def test_check_output(self):
@@ -57,7 +57,7 @@ class TestDropoutOp5(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.75, 'is_training': False}
+        self.attrs = {'dropout_prob': 0.75, 'is_test': True}
         self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']}
 
     def test_check_output(self):
diff --git a/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py b/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py
deleted file mode 100644
index c2d8b48ea944ae40a451492b8e9fad38dda0835c..0000000000000000000000000000000000000000
--- a/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py
+++ /dev/null
@@ -1,171 +0,0 @@
-import logging
-import paddle.v2.fluid.core as core
-import unittest
-from paddle.v2.fluid.op import Operator, DynamicRecurrentOp
-import numpy as np
-
-# for siplicity, just one level LoD
-lod_py = [[0, 4, 7, 9, 10]]
-input_dim = 30
-num_sents = len(lod_py[0]) - 1
-weight_dim = 15
-
-
-def create_tensor(scope, name, shape, np_data):
-    tensor = scope.var(name).get_tensor()
-    tensor.set_dims(shape)
-    tensor.set(np_data, core.CPUPlace())
-    return tensor
-
-
-class PyRNNStep(object):
-    def __init__(self):
-
-        self.x = np.random.normal(size=(lod_py[0][-1],
-                                        input_dim)).astype("float32")
-        self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32")
-        self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32")
-        self.h_boot = np.random.normal(size=(num_sents,
-                                             input_dim)).astype("float32")
-
-
-class DynamicRecurrentOpTest(unittest.TestCase):
-    '''
-    Test RNNOp
-
-    equation:
-        h_t = \sigma (W x_t + U h_{t-1})
-    weights:
-        - W
-        - U
-    vars:
-        - x
-    states:
-        - h
-    outputs:
-       - h
-    '''
-
-    py = PyRNNStep()
-
-    def forward(self):
-        self.scope = core.Scope()
-        self.create_global_variables()
-        self.create_rnn_op()
-        self.create_step_net()
-        ctx = core.DeviceContext.create(core.CPUPlace())
-        self.rnnop.run(self.scope, ctx)
-        state = self.rnnop.get_state("h@state")
-        print 'state size: ', state.size()
-
-        step_inputs = self.rnnop.get_step_input("x")
-        print "x size ", step_inputs.size()
-        for i in range(step_inputs.size()):
-            print "x %d" % i, np.array(step_inputs.read(i).get_dims())
-        step_outputs = self.rnnop.get_step_output('h@state')
-        print 'step_outputs.size ', step_outputs.size()
-        output = self.scope.find_var("h@state").get_tensor()
-        print 'output', np.array(output).shape
-
-    def create_global_variables(self):
-        # create inlink
-        x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim],
-                                 self.py.x)
-        x_tensor.set_lod(lod_py)
-        create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W)
-        create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U)
-        create_tensor(self.scope, "h_boot", [num_sents, input_dim],
-                      self.py.h_boot)
-        self.scope.var("step_scopes")
-        self.scope.var("h@state")
-
-    def create_rnn_op(self):
-        # create RNNOp
-        self.rnnop = DynamicRecurrentOp(
-            # inputs
-            inputs=["x"],
-            initial_states=["h_boot"],
-            step_net="step_unit",
-            # outputs
-            outputs=["h@state"],
-            step_scopes="step_scopes",
-            # attributes
-            ex_states=["h@pre"],
-            states=["h@state"])
-
-    def create_step_net(self):
-        step_unit = core.Net.create()
-        x_fc_op = Operator("mul", X="x", Y="W", Out="Wx")
-        h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
-        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
-        sig_op = Operator("sigmoid", X="sum", Y="h@state")
-
-        for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
-            step_unit.append_op(op)
-        step_unit.complete_add_op(True)
-        self.rnnop.set_step_unit(step_unit)
-
-    def test_forward(self):
-        print 'test recurrent op forward'
-        pd_output = self.forward()
-        print 'pd_output', pd_output
-
-
-class RecurrentGradientOpTest(unittest.TestCase):
-    py = PyRNNStep()
-
-    def create_forward_op(self):
-        # create RNNOp
-        self.forward_op = DynamicRecurrentOp(
-            # inputs
-            inputs=["x"],
-            initial_states=["h_boot"],
-            step_net="step_unit",
-            # outputs
-            outputs=["h@state"],
-            step_scopes="step_scopes",
-            # attributes
-            ex_states=["h@pre"],
-            states=["h@state"])
-
-    def create_gradient_op(self):
-        a = set()
-        backward_op = core.DynamicRecurrentOp.backward(self.forward_op, a)
-
-    def create_step_net(self):
-        step_unit = core.Net.create()
-        x_fc_op = Operator("mul", X="x", Y="W", Out="Wx")
-        h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
-        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
-        sig_op = Operator("sigmoid", X="sum", Y="h@state")
-
-        for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
-            step_unit.append_op(op)
-        step_unit.complete_add_op(True)
-        self.forward_op.set_step_unit(step_unit)
-
-    def create_global_variables(self):
-        # create inlink
-        x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim],
-                                 self.py.x)
-        x_tensor.set_lod(lod_py)
-        create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W)
-        create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U)
-        create_tensor(self.scope, "h_boot", [num_sents, input_dim],
-                      self.py.h_boot)
-        self.scope.var("step_scopes")
-        self.scope.var("h@state")
-
-    def test_grad(self):
-        self.scope = core.Scope()
-        self.create_forward_op()
-        self.create_global_variables()
-        self.create_step_net()
-        self.create_gradient_op()
-
-
-if __name__ == '__main__':
-    exit(
-        0
-    )  # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
-    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_executor_and_mul.py b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
index 709250d0c86dde84ac22c37d8e2385ca4a80a40a..b1ef87c5cb1711c419b401c5950839816f7f4160 100644
--- a/python/paddle/v2/fluid/tests/test_executor_and_mul.py
+++ b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
@@ -1,33 +1,27 @@
 import unittest
-from paddle.v2.fluid.layers import mul, data
+
+import numpy
 import paddle.v2.fluid.core as core
+
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.framework import g_main_program
-import numpy
+from paddle.v2.fluid.layers import mul, data
 
 
 class TestExecutor(unittest.TestCase):
     def test_mul(self):
-        a = data(name='a', shape=[784], data_type='float32')
+        a = data(name='a', shape=[784], dtype='float32')
         b = data(
             name='b',
             shape=[784, 100],
-            data_type='float32',
+            dtype='float32',
             append_batch_size=False)
         out = mul(x=a, y=b)
         place = core.CPUPlace()
         a_np = numpy.random.random((100, 784)).astype('float32')
-        tensor_a = core.LoDTensor()
-        tensor_a.set(a_np, place)
         b_np = numpy.random.random((784, 100)).astype('float32')
-        tensor_b = core.LoDTensor()
-        tensor_b.set(b_np, place)
         exe = Executor(place)
-        outs = exe.run(g_main_program,
-                       feed={'a': tensor_a,
-                             'b': tensor_b},
-                       fetch_list=[out])
-        out = numpy.array(outs[0])
+        outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
+        out = outs[0]
         self.assertEqual((100, 100), out.shape)
         self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
 
diff --git a/python/paddle/v2/fluid/tests/test_ftrl_op.py b/python/paddle/v2/fluid/tests/test_ftrl_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f77ac4659a9b877829f7ae52dd005d9dd11dac07
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_ftrl_op.py
@@ -0,0 +1,62 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestFTRLOp(OpTest):
+    def setUp(self):
+        self.op_type = "ftrl"
+        w = np.random.random((102, 105)).astype("float32")
+        g = np.random.random((102, 105)).astype("float32")
+        sq_accum = np.full((102, 105), 0.1).astype("float32")
+        linear_accum = np.full((102, 105), 0.1).astype("float32")
+        lr = np.array([0.01]).astype("float32")
+        l1 = 0.1
+        l2 = 0.2
+        lr_power = -0.5
+
+        self.inputs = {
+            'Param': w,
+            'SquaredAccumulator': sq_accum,
+            'LinearAccumulator': linear_accum,
+            'Grad': g,
+            'LearningRate': lr
+        }
+        self.attrs = {
+            'l1': l1,
+            'l2': l2,
+            'lr_power': lr_power,
+            'learning_rate': lr
+        }
+        new_accum = sq_accum + g * g
+        if lr_power == -0.5:
+            linear_out = linear_accum + g - (
+                (np.sqrt(new_accum) - np.sqrt(sq_accum)) / lr) * w
+        else:
+            linear_out = linear_accum + g - ((np.power(
+                new_accum, -lr_power) - np.power(sq_accum, -lr_power)) / lr) * w
+
+        x = (l1 * np.sign(linear_out) - linear_out)
+        if lr_power == -0.5:
+            y = (np.sqrt(new_accum) / lr) + (2 * l2)
+            pre_shrink = x / y
+            param_out = np.where(np.abs(linear_out) > l1, pre_shrink, 0.0)
+        else:
+            y = (np.power(new_accum, -lr_power) / lr) + (2 * l2)
+            pre_shrink = x / y
+            param_out = np.where(np.abs(linear_out) > l1, pre_shrink, 0.0)
+
+        sq_accum_out = sq_accum + g * g
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'SquaredAccumOut': sq_accum_out,
+            'LinearAccumOut': linear_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_gru_op.py b/python/paddle/v2/fluid/tests/test_gru_op.py
index b2474cff94c6c71cc62bc8e69a5d83e38d51c511..fa2c5a53ec4a01b6545e25f773c11277a4d24706 100644
--- a/python/paddle/v2/fluid/tests/test_gru_op.py
+++ b/python/paddle/v2/fluid/tests/test_gru_op.py
@@ -6,7 +6,8 @@ from test_lstm_op import identity, sigmoid, tanh, relu
 
 
 class TestGRUOp(OpTest):
-    batch_size = 9
+    lod = [[0, 2, 6, 9]]
+    batch_size = lod[0][-1]
     frame_size = 5
     activate = {
         'identity': identity,
@@ -35,7 +36,7 @@ class TestGRUOp(OpTest):
                            seq_starts[sorted_seqs[i]] + batch_idx)
                 idx_in_seq.append(idx)
             idx_in_seq_list.append(idx_in_seq)
-        return idx_in_seq_list
+        return idx_in_seq_list, sorted_seqs
 
     def gru_step(self, x, h_p, w, b):
         batch_size = x.shape[0]
@@ -66,8 +67,8 @@ class TestGRUOp(OpTest):
         batch_hidden = self.outputs['BatchHidden']
         hidden = self.outputs['Hidden']
         idx_in_seq_list = self.idx_in_seq_list
-        h_p = self.inputs['H0'] if self.inputs.has_key('H0') else np.zeros(
-            (len(idx_in_seq_list[0]), self.frame_size))
+        h_p = self.inputs['H0'][self.sorted_seqs] if self.inputs.has_key(
+            'H0') else np.zeros((len(idx_in_seq_list[0]), self.frame_size))
         num_batch = len(idx_in_seq_list)
         end_idx = 0
         for batch_idx in range(num_batch):
@@ -84,8 +85,9 @@ class TestGRUOp(OpTest):
         return batch_gate, batch_reset_hidden_prev, hidden
 
     def set_data(self):
-        lod = [[0, 2, 6, self.batch_size]]
-        self.idx_in_seq_list = self.seq_to_batch(lod, self.is_reverse)
+        lod = self.lod
+        self.idx_in_seq_list, self.sorted_seqs = self.seq_to_batch(
+            lod, self.is_reverse)
         batch_size = self.batch_size
         frame_size = self.frame_size
         input = np.random.rand(batch_size, frame_size * 3).astype('float64')
@@ -146,7 +148,7 @@ class TestGRUOpReverse(TestGRUOp):
     def set_confs(self):
         self.is_reverse = True
         self.attrs = {
-            'activation': 'identity',
+            'activation': 'tanh',
             'gate_activation': 'sigmoid',
             'is_reverse': self.is_reverse
         }
diff --git a/python/paddle/v2/fluid/tests/test_gru_unit_op.py b/python/paddle/v2/fluid/tests/test_gru_unit_op.py
index f356f6e9ec0da2d3e1fb67638d81e8d54c544f53..501d5aa5797d6def708338692f0861657f951ef7 100644
--- a/python/paddle/v2/fluid/tests/test_gru_unit_op.py
+++ b/python/paddle/v2/fluid/tests/test_gru_unit_op.py
@@ -28,8 +28,8 @@ def relu(x):
 
 
 class TestGRUUnitOp(OpTest):
-    batch_size = 3
-    frame_size = 5
+    batch_size = 5
+    frame_size = 10
     activate = {
         GRUActivationType.identity: identity,
         GRUActivationType.sigmoid: sigmoid,
@@ -77,7 +77,7 @@ class TestGRUUnitOp(OpTest):
         c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
                                                     g[:, frame_size * 2:])
         g = np.hstack((u_r, c))
-        h = u * h_p + (1 - u) * c
+        h = u * c + (1 - u) * h_p
         self.outputs = {
             'Gate': g.astype('float64'),
             'ResetHiddenPrev': r_h_p.astype('float64'),
@@ -92,10 +92,7 @@ class TestGRUUnitOp(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(
-            ['Input', 'HiddenPrev', 'Weight'],
-            ['Hidden', 'ResetHiddenPrev', 'Gate'],
-            max_relative_error=0.007)
+        self.check_grad(['Input', 'HiddenPrev', 'Weight'], ['Hidden'])
 
 
 class TestGRUUnitOpWithBias(TestGRUUnitOp):
@@ -104,18 +101,20 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp):
         frame_size = self.frame_size
         super(TestGRUUnitOpWithBias, self).set_inputs()
         self.inputs['Bias'] = np.random.uniform(
-            -0.1, 0.1, (1, frame_size * 3)).astype('float32')
+            -0.1, 0.1, (1, frame_size * 3)).astype('float64')
         self.attrs = {
             'activation': GRUActivationType.identity,
             'gate_activation': GRUActivationType.sigmoid
         }
 
     def test_check_grad(self):
+        self.check_grad(['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'])
+
+    def test_check_grad_ingore_input(self):
         self.check_grad(
-            ['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'],
-            max_relative_error=0.007)
+            ['HiddenPrev', 'Weight', 'Bias'], ['Hidden'],
+            no_grad_set=set('Input'))
 
 
 if __name__ == '__main__':
-    exit(0)  # FIXME(yuyang18): This unittest is not pass. Fix it later
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_image_classification_layer.py b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
index bf5444107fa1609e67b09823b82e5fb92234b0a4..2fd609d4474e97ecd96adcd146f2f550e0772740 100644
--- a/python/paddle/v2/fluid/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
@@ -1,6 +1,6 @@
 import unittest
 
-import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.framework import Program
 
@@ -29,27 +29,35 @@ class TestLayer(unittest.TestCase):
     def test_batch_norm_layer(self):
         main_program = Program()
         startup_program = Program()
-        images = layers.data(
+        images = fluid.layers.data(
             name='pixel',
             shape=[3, 48, 48],
-            data_type='float32',
+            dtype='float32',
             main_program=main_program)
-        layers.batch_norm(
+        hidden1 = fluid.layers.batch_norm(
             input=images,
             main_program=main_program,
             startup_program=startup_program)
+        hidden2 = fluid.layers.fc(input=hidden1,
+                                  size=128,
+                                  act='relu',
+                                  main_program=main_program)
+        hidden3 = fluid.layers.batch_norm(
+            input=hidden2,
+            main_program=main_program,
+            startup_program=startup_program)
 
-        # print str(main_program)
+        print str(main_program)
 
     def test_dropout_layer(self):
         main_program = Program()
         startup_program = Program()
-        images = layers.data(
+        images = fluid.layers.data(
             name='pixel',
             shape=[3, 48, 48],
-            data_type='float32',
+            dtype='float32',
             main_program=main_program)
-        layers.dropout(
+        fluid.layers.dropout(
             x=images,
             dropout_prob=0.5,
             main_program=main_program,
@@ -61,10 +69,10 @@ class TestLayer(unittest.TestCase):
         main_program = Program()
         startup_program = Program()
 
-        images = layers.data(
+        images = fluid.layers.data(
             name='pixel',
             shape=[3, 48, 48],
-            data_type='float32',
+            dtype='float32',
             main_program=main_program,
             startup_program=startup_program)
         conv1 = conv_block(images, 64, 2, [0.3, 0], main_program,
@@ -77,19 +85,19 @@ class TestLayer(unittest.TestCase):
     def test_elementwise_add_with_act(self):
         main_program = Program()
         startup_program = Program()
-        image1 = layers.data(
+        image1 = fluid.layers.data(
             name='pixel1',
             shape=[3, 48, 48],
-            data_type='float32',
+            dtype='float32',
             main_program=main_program,
             startup_program=startup_program)
-        image2 = layers.data(
+        image2 = fluid.layers.data(
             name='pixel2',
             shape=[3, 48, 48],
-            data_type='float32',
+            dtype='float32',
             main_program=main_program,
             startup_program=startup_program)
-        out = layers.elementwise_add(
+        out = fluid.layers.elementwise_add(
             x=image1,
             y=image2,
             act='relu',
diff --git a/python/paddle/v2/fluid/tests/test_inference_model_io.py b/python/paddle/v2/fluid/tests/test_inference_model_io.py
index 98b95713b73e8eba93bd6a58eaaed603cfae7952..60aed62ead83dedbeb9438c431ec292558d88ce5 100644
--- a/python/paddle/v2/fluid/tests/test_inference_model_io.py
+++ b/python/paddle/v2/fluid/tests/test_inference_model_io.py
@@ -1,13 +1,13 @@
-import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
+import unittest
+
+import numpy as np
 import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
 
+import paddle.v2.fluid.executor as executor
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.optimizer as optimizer
 from paddle.v2.fluid.framework import Program
 from paddle.v2.fluid.io import save_inference_model, load_inference_model
-import paddle.v2.fluid.executor as executor
-import unittest
-import numpy as np
 
 
 class TestBook(unittest.TestCase):
@@ -19,13 +19,13 @@ class TestBook(unittest.TestCase):
         x = layers.data(
             name='x',
             shape=[2],
-            data_type='float32',
+            dtype='float32',
             main_program=program,
             startup_program=init_program)
         y = layers.data(
             name='y',
             shape=[1],
-            data_type='float32',
+            dtype='float32',
             main_program=program,
             startup_program=init_program)
 
@@ -44,7 +44,7 @@ class TestBook(unittest.TestCase):
             x=cost, main_program=program, startup_program=init_program)
 
         sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-        opts = sgd_optimizer.minimize(avg_cost, init_program)
+        sgd_optimizer.minimize(avg_cost, init_program)
 
         place = core.CPUPlace()
         exe = executor.Executor(place)
@@ -52,25 +52,20 @@ class TestBook(unittest.TestCase):
         exe.run(init_program, feed={}, fetch_list=[])
 
         for i in xrange(100):
-            x_data = np.array(
+            tensor_x = np.array(
                 [[1, 1], [1, 2], [3, 4], [5, 2]]).astype("float32")
-            y_data = np.array([[-2], [-3], [-7], [-7]]).astype("float32")
+            tensor_y = np.array([[-2], [-3], [-7], [-7]]).astype("float32")
 
-            tensor_x = core.LoDTensor()
-            tensor_x.set(x_data, place)
-            tensor_y = core.LoDTensor()
-            tensor_y.set(y_data, place)
             exe.run(program,
                     feed={'x': tensor_x,
                           'y': tensor_y},
                     fetch_list=[avg_cost])
 
         save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program)
-        outs = exe.run(program,
-                       feed={'x': tensor_x,
-                             'y': tensor_y},
-                       fetch_list=[avg_cost])
-        expected = np.array(outs[0])
+        expected = exe.run(program,
+                           feed={'x': tensor_x,
+                                 'y': tensor_y},
+                           fetch_list=[avg_cost])[0]
 
         reload(executor)  # reload to build a new scope
         exe = executor.Executor(place)
@@ -83,7 +78,7 @@ class TestBook(unittest.TestCase):
             feed={feed_var_names[0]: tensor_x,
                   feed_var_names[1]: tensor_y},
             fetch_list=fetch_vars)
-        actual = np.array(outs[0])
+        actual = outs[0]
 
         self.assertEqual(feed_var_names, ["x", "y"])
         self.assertEqual(len(fetch_vars), 1)
diff --git a/python/paddle/v2/fluid/tests/test_initializer.py b/python/paddle/v2/fluid/tests/test_initializer.py
index f2eb79b209627f5814847db6d96c0a17300d9b5a..6c20203f8eca02b3f68ed2aa8664bed29551c070 100644
--- a/python/paddle/v2/fluid/tests/test_initializer.py
+++ b/python/paddle/v2/fluid/tests/test_initializer.py
@@ -223,5 +223,109 @@ class TestXavierInitializer(unittest.TestCase):
         self.assertEqual(init_op.attr('seed'), 134)
 
 
+class TestMSRAInitializer(unittest.TestCase):
+    def test_uniform_msra_initializer(self):
+        """Test MSRA initializer with uniform distribution on
+           for matrix multiply.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.MSRAInitializer())
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'uniform_random')
+        limit = np.sqrt(6.0 / param.shape[0])
+        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_uniform_msra_initializer_conv(self):
+        """Test MSRA initializer with uniform distribution on
+           for convolutions.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10, 15, 20],
+            lod_level=0,
+            name="param",
+            initializer=initializer.MSRAInitializer())
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'uniform_random')
+        receptive_field_size = float(15 * 20)
+        limit = np.sqrt(6.0 / (param.shape[1] * receptive_field_size))
+        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_normal_msra_initializer(self):
+        """Test MSRA initializer with normal distribution on
+           for matrix multiply.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.MSRAInitializer(uniform=False))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'gaussian_random')
+        std = np.sqrt(2.0 / param.shape[0])
+        self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_normal_msra_initializer_conv(self):
+        """Test MSRA initializer with normal distribution on
+           for convolutions.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10, 15, 20],
+            lod_level=0,
+            name="param",
+            initializer=initializer.MSRAInitializer(uniform=False))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'gaussian_random')
+        receptive_field_size = float(15 * 20)
+        std = np.sqrt(2.0 / (param.shape[1] * receptive_field_size))
+        self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_msra_initializer_supplied_arguments(self):
+        """Test the MSRA initializer with supplied arguments
+        """
+        program = framework.Program()
+        block = program.global_block()
+        block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.MSRAInitializer(
+                fan_in=12, seed=134))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'uniform_random')
+        limit = np.sqrt(6.0 / 12)
+        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 134)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_is_empty_op.py b/python/paddle/v2/fluid/tests/test_is_empty_op.py
similarity index 92%
rename from python/paddle/v2/framework/tests/test_is_empty_op.py
rename to python/paddle/v2/fluid/tests/test_is_empty_op.py
index 129d1c19447990fb0affa8fb10bc7156ec5c8cc3..ed6e3fe24f6333c9c90d760787eb13241a7e1868 100644
--- a/python/paddle/v2/framework/tests/test_is_empty_op.py
+++ b/python/paddle/v2/fluid/tests/test_is_empty_op.py
@@ -1,7 +1,7 @@
 import unittest
 import numpy as np
-from paddle.v2.framework.op import Operator
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.op import Operator
+import paddle.v2.fluid.core as core
 
 
 def create_tensor(scope, name, np_data):
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 3d18e7ce3a4dc6c6b917a1000de39fca71f6ac18..b6906be60b8ffb7c7afc220ad4f40c6f60a0b112 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -1,25 +1,26 @@
+import unittest
+
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.framework import Program
-import paddle.v2.fluid.core as core
-import unittest
 
 
 class TestBook(unittest.TestCase):
     def test_fit_a_line(self):
         program = Program()
         x = layers.data(
-            name='x', shape=[13], data_type='float32', main_program=program)
+            name='x', shape=[13], dtype='float32', main_program=program)
         y_predict = layers.fc(input=x, size=1, act=None, main_program=program)
 
         y = layers.data(
-            name='y', shape=[1], data_type='float32', main_program=program)
+            name='y', shape=[1], dtype='float32', main_program=program)
         cost = layers.square_error_cost(
             input=y_predict, label=y, main_program=program)
 
         avg_cost = layers.mean(x=cost, main_program=program)
         self.assertIsNotNone(avg_cost)
         program.append_backward(avg_cost)
+
         print str(program)
 
     def test_recognize_digits_mlp(self):
@@ -27,12 +28,9 @@ class TestBook(unittest.TestCase):
 
         # Change g_program, so the rest layers use `g_program`
         images = layers.data(
-            name='pixel',
-            shape=[784],
-            data_type='float32',
-            main_program=program)
+            name='pixel', shape=[784], dtype='float32', main_program=program)
         label = layers.data(
-            name='label', shape=[1], data_type='int32', main_program=program)
+            name='label', shape=[1], dtype='int32', main_program=program)
         hidden1 = layers.fc(input=images,
                             size=128,
                             act='relu',
@@ -49,6 +47,7 @@ class TestBook(unittest.TestCase):
             input=predict, label=label, main_program=program)
         avg_cost = layers.mean(x=cost, main_program=program)
         self.assertIsNotNone(avg_cost)
+
         print str(program)
 
     def test_simple_conv2d(self):
@@ -56,7 +55,7 @@ class TestBook(unittest.TestCase):
         images = layers.data(
             name='pixel',
             shape=[3, 48, 48],
-            data_type='int32',
+            dtype='int32',
             main_program=program)
         layers.conv2d(
             input=images,
@@ -66,16 +65,25 @@ class TestBook(unittest.TestCase):
 
         print str(program)
 
+    def test_conv2d_transpose(self):
+        program = Program()
+        kwargs = {'main_program': program}
+        img = layers.data(
+            name='pixel', shape=[3, 2, 2], dtype='float32', **kwargs)
+        layers.conv2d_transpose(
+            input=img, num_filters=10, output_size=28, **kwargs)
+        print str(program)
+
     def test_recognize_digits_conv(self):
         program = Program()
 
         images = layers.data(
             name='pixel',
             shape=[1, 28, 28],
-            data_type='float32',
+            dtype='float32',
             main_program=program)
         label = layers.data(
-            name='label', shape=[1], data_type='int32', main_program=program)
+            name='label', shape=[1], dtype='int32', main_program=program)
         conv_pool_1 = nets.simple_img_conv_pool(
             input=images,
             filter_size=5,
@@ -110,40 +118,40 @@ class TestBook(unittest.TestCase):
         dict_size = 10000
         embed_size = 32
         first_word = layers.data(
-            name='firstw', shape=[1], data_type='int64', main_program=program)
+            name='firstw', shape=[1], dtype='int64', main_program=program)
         second_word = layers.data(
-            name='secondw', shape=[1], data_type='int64', main_program=program)
+            name='secondw', shape=[1], dtype='int64', main_program=program)
         third_word = layers.data(
-            name='thirdw', shape=[1], data_type='int64', main_program=program)
+            name='thirdw', shape=[1], dtype='int64', main_program=program)
         forth_word = layers.data(
-            name='forthw', shape=[1], data_type='int64', main_program=program)
+            name='forthw', shape=[1], dtype='int64', main_program=program)
         next_word = layers.data(
-            name='nextw', shape=[1], data_type='int64', main_program=program)
+            name='nextw', shape=[1], dtype='int64', main_program=program)
 
         embed_first = layers.embedding(
             input=first_word,
             size=[dict_size, embed_size],
-            data_type='float32',
-            param_attr={'name': 'shared_w'},
+            dtype='float32',
+            param_attr='shared_w',
             main_program=program)
         embed_second = layers.embedding(
             input=second_word,
             size=[dict_size, embed_size],
-            data_type='float32',
-            param_attr={'name': 'shared_w'},
+            dtype='float32',
+            param_attr='shared_w',
             main_program=program)
 
         embed_third = layers.embedding(
             input=third_word,
             size=[dict_size, embed_size],
-            data_type='float32',
-            param_attr={'name': 'shared_w'},
+            dtype='float32',
+            param_attr='shared_w',
             main_program=program)
         embed_forth = layers.embedding(
             input=forth_word,
             size=[dict_size, embed_size],
-            data_type='float32',
-            param_attr={'name': 'shared_w'},
+            dtype='float32',
+            param_attr='shared_w',
             main_program=program)
 
         concat_embed = layers.concat(
@@ -166,6 +174,20 @@ class TestBook(unittest.TestCase):
 
         print str(program)
 
+    def test_linear_chain_crf(self):
+        program = Program()
+
+        # Change g_program, so the rest layers use `g_program`
+        images = layers.data(
+            name='pixel', shape=[784], dtype='float32', main_program=program)
+        label = layers.data(
+            name='label', shape=[1], dtype='int32', main_program=program)
+        hidden = layers.fc(input=images, size=128, main_program=program)
+        crf = layers.linear_chain_crf(
+            input=hidden, label=label, main_program=program)
+
+        print str(program)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py b/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
index 6f06a66c825b37ee91214efc0a29a58f0b9057f9..c26634ff20c46e484d600c758be386ec8327d1c1 100644
--- a/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
+++ b/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
@@ -104,7 +104,7 @@ class TestLinearChainCrfOp(OpTest):
         transition_exps = np.exp(transition)
 
         labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int32")
+            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
 
         self.inputs = {
             "Emission": (emission, lod),
diff --git a/python/paddle/v2/fluid/tests/test_lod_array_length_op.py b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py
index a01ae83772185df218b8c453557dc0cac719673b..8a4be545eda841dbda33b7c8cae9f91a4199f2f8 100644
--- a/python/paddle/v2/fluid/tests/test_lod_array_length_op.py
+++ b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py
@@ -13,7 +13,7 @@ class TestLoDArrayLength(unittest.TestCase):
         arr_len = layers.array_length(arr)
         cpu = core.CPUPlace()
         exe = Executor(cpu)
-        result = numpy.array(exe.run(fetch_list=[arr_len])[0])
+        result = exe.run(fetch_list=[arr_len])[0]
         self.assertEqual(11, result[0])
 
 
diff --git a/python/paddle/v2/fluid/tests/test_lod_rank_table.py b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
index bbc11930b9e804c2769cc590c298c6e90dc36ca6..30d619fe318517345195281b17f88e9916b6afb3 100644
--- a/python/paddle/v2/fluid/tests/test_lod_rank_table.py
+++ b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
@@ -1,6 +1,5 @@
 from paddle.v2.fluid.layers import lod_rank_table, data
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.framework import g_main_program
 import paddle.v2.fluid.core as core
 import numpy
 import unittest
@@ -18,7 +17,7 @@ class TestLoDRankTable(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(numpy.random.random(size=(17, 100)), cpu)
         tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]])
-        exe.run(g_main_program, scope=scope, feed={'x': tensor})
+        exe.run(scope=scope, feed={'x': tensor})
         var = scope.find_var(rank_table.name)
         table = var.get_lod_rank_table()
         self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items())
diff --git a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
index b18cb6b49fa41f26e1b6de1128690507c5a2f099..0a916a55bc3d097e17fb504b0d6b2f2818f030c9 100644
--- a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
@@ -18,7 +18,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor.set_lod([[0, 3, 9, 10]])
         expect = map(lambda x: numpy.array(x).astype('int32'),
                      [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
-        self.main(tensor=tensor, expect_array=expect, expect_lod=[] * 6)
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=[] * 6,
+            expect_max_len=6)
 
     def test_lod_tensor_to_array_level_0_empty_seq(self):
         tensor = core.LoDTensor()
@@ -27,7 +31,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor.set_lod([[0, 3, 9, 9, 10]])
         expect = map(lambda x: numpy.array(x).astype('int32'),
                      [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
-        self.main(tensor=tensor, expect_array=expect, expect_lod=[] * 6)
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=[] * 6,
+            expect_max_len=6)
 
     def test_lod_tensor_to_array_level_1(self):
         tensor = core.LoDTensor()
@@ -44,7 +52,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         ]
 
         lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]]
-        self.main(tensor=tensor, expect_array=expect, expect_lod=lod)
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=lod,
+            expect_max_len=3)
 
     def test_lod_tensor_to_array_level_1_empty_seq(self):
         tensor = core.LoDTensor()
@@ -63,7 +75,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         ]
 
         lod = [[[0, 5, 8, 8, 15]], [[0, 2, 6, 7, 8]], [[0, 2, 6]], [[0, 2]]]
-        self.main(tensor=tensor, expect_array=expect, expect_lod=lod)
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=lod,
+            expect_max_len=4)
 
     def test_lod_tensor_to_array_level_2(self):
         tensor = core.LoDTensor()
@@ -80,7 +96,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         ]
         lod = [[[0, 1, 3, 4], [0, 1, 4, 8, 12]],
                [[0, 4, 7], [0, 1, 5, 9, 17, 21, 27, 31]], [[0, 2], [0, 6, 7]]]
-        self.main(tensor=tensor, expect_array=expect, expect_lod=lod)
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=lod,
+            expect_max_len=3)
 
     def test_lod_tensor_to_array_level_2_skip_level(self):
         tensor = core.LoDTensor()
@@ -88,14 +108,21 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             numpy.arange(50).reshape(50, 1).astype('int32'), self.place())
         tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13],
                         [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]])
-        self.main(tensor=tensor, expect_array=None, expect_lod=None, level=1)
-
-    def main(self, tensor, expect_array, expect_lod, level=0):
+        self.main(
+            tensor=tensor,
+            expect_array=None,
+            expect_lod=None,
+            expect_max_len=4,
+            level=1)
+
+    def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0):
         place = self.place()
         program = Program()
         x = layers.data(name='x', shape=[10], main_program=program)
         x.persistable = True
         table = layers.lod_rank_table(x, level=level, main_program=program)
+        max_len = layers.max_sequence_len(table, main_program=program)
+        max_len.persistable = True
         array = layers.lod_tensor_to_array(x, table, main_program=program)
         array.persistable = True
 
@@ -110,6 +137,10 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             self.check_array_same(array, expect_array, expect_lod)
         self.check_tensor_same(scope.find_var(result.name).get_tensor(), tensor)
 
+        self.assertEqual(
+            numpy.array(scope.find_var(max_len.name).get_tensor())[0],
+            expect_max_len)
+
     def check_array_same(self, array, expect_tensor, expect_lod):
         self.assertEqual(len(expect_tensor), len(array))
         for i, exp in enumerate(zip(expect_tensor, expect_lod)):
@@ -132,7 +163,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
         x = layers.data(
             name='x',
             shape=[1],
-            data_type='float32',
+            dtype='float32',
             main_program=program,
             stop_gradient=False)
         table = layers.lod_rank_table(x, level=0, main_program=program)
@@ -151,10 +182,11 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
 
         exe = Executor(place)
         g_out = [
-            item.sum()
-            for item in map(
-                numpy.array,
-                exe.run(program, feed={'x': tensor}, fetch_list=[g_vars]))
+            numpy.array(item).sum()
+            for item in exe.run(program,
+                                feed={'x': tensor},
+                                fetch_list=[g_vars],
+                                return_numpy=False)
         ]
         g_out_sum = numpy.array(g_out).sum()
 
diff --git a/python/paddle/v2/fluid/tests/test_log_loss_op.py b/python/paddle/v2/fluid/tests/test_log_loss_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eeaa90758c57ef0d92a8ad7b0a4c1b1f2c38be3
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_log_loss_op.py
@@ -0,0 +1,33 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLogLossOp(OpTest):
+    def setUp(self):
+        self.op_type = 'log_loss'
+        samples_num = 32
+
+        predicted = np.random.uniform(0.1, 1.0,
+                                      (samples_num, 1)).astype("float32")
+        labels = np.random.randint(0, 2, (samples_num, 1)).astype("float32")
+        epsilon = 1e-4
+        self.inputs = {
+            'Predicted': predicted,
+            'Labels': labels,
+        }
+
+        self.attrs = {'epsilon': epsilon}
+        loss = -labels * np.log(predicted + epsilon) - (
+            1 - labels) * np.log(1 - predicted + epsilon)
+        self.outputs = {'Loss': loss}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Predicted'], 'Loss', max_relative_error=0.03)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_logical_op.py b/python/paddle/v2/fluid/tests/test_logical_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac90bf839cb96053387bb82c112692136707744c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_logical_op.py
@@ -0,0 +1,35 @@
+import op_test
+import unittest
+import numpy as np
+
+
+def create_test_class(op_type, callback, binary_op=True):
+    class Cls(op_test.OpTest):
+        def setUp(self):
+            a = np.random.choice(a=[True, False], size=(10, 7)).astype(bool)
+            if binary_op:
+                b = np.random.choice(a=[True, False], size=(10, 7)).astype(bool)
+                c = callback(a, b)
+            else:
+                c = callback(a)
+            self.outputs = {'Out': c}
+            self.op_type = op_type
+            if binary_op:
+                self.inputs = {'X': a, 'Y': b}
+            else:
+                self.inputs = {'X': a}
+
+        def test_output(self):
+            self.check_output()
+
+    Cls.__name__ = op_type
+    globals()[op_type] = Cls
+
+
+create_test_class('logical_and', lambda _a, _b: np.logical_and(_a, _b))
+create_test_class('logical_or', lambda _a, _b: np.logical_or(_a, _b))
+create_test_class('logical_not', lambda _a: np.logical_not(_a), False)
+create_test_class('logical_xor', lambda _a, _b: np.logical_xor(_a, _b))
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_maxout_op.py b/python/paddle/v2/fluid/tests/test_maxout_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fbed43e254b811d38e441e946a73c24f87373de
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_maxout_op.py
@@ -0,0 +1,37 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def maxout_forward_naive(input, groups):
+    s0, s1, s2, s3 = input.shape
+    return np.ndarray([s0, s1 / groups, groups, s2, s3], \
+        buffer = input, dtype=input.dtype).max(axis=(2))
+
+
+class TestMaxOutOp(OpTest):
+    def setUp(self):
+        self.op_type = "maxout"
+        self.init_test_case()
+        input = np.random.random(self.shape).astype("float32")
+        output = self.MaxOut_forward_naive(input, self.groups).astype("float32")
+
+        self.inputs = {'X': input}
+        self.attrs = {'groups': self.groups}
+
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+    def init_test_case(self):
+        self.MaxOut_forward_naive = maxout_forward_naive
+        self.shape = [100, 6, 2, 2]
+        self.groups = 2
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..50fcc4a72ddbd6d7a3d3b73434c6ac8de5a006e2
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
@@ -0,0 +1,138 @@
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.optimizer import MomentumOptimizer
+import paddle.v2.fluid.core as core
+import paddle.v2 as paddle
+import unittest
+import numpy as np
+
+
+class TestMNISTIfElseOp(unittest.TestCase):
+    def test_raw_api(self):
+        kwargs = {'startup_program': Program(), 'main_program': Program()}
+        image = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
+
+        label = layers.data(name='y', shape=[1], dtype='int64', **kwargs)
+
+        limit = layers.fill_constant_batch_size_like(
+            input=label, dtype='int64', shape=[1], value=5.0, **kwargs)
+
+        cond = layers.less_than(x=label, y=limit, **kwargs)
+        true_image, false_image = layers.split_lod_tensor(
+            input=image, mask=cond, **kwargs)
+
+        true_out = layers.create_tensor(dtype='float32', **kwargs)
+        true_cond = layers.ConditionalBlock([true_image], **kwargs)
+
+        with true_cond.block():
+            hidden = layers.fc(input=true_image, size=100, act='tanh', **kwargs)
+            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
+            layers.assign(input=prob, output=true_out, **kwargs)
+
+        false_out = layers.create_tensor(dtype='float32', **kwargs)
+        false_cond = layers.ConditionalBlock([false_image], **kwargs)
+
+        with false_cond.block():
+            hidden = layers.fc(input=false_image,
+                               size=200,
+                               act='tanh',
+                               **kwargs)
+            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
+            layers.assign(input=prob, output=false_out, **kwargs)
+
+        prob = layers.merge_lod_tensor(
+            in_true=true_out, in_false=false_out, mask=cond, x=image, **kwargs)
+        loss = layers.cross_entropy(input=prob, label=label, **kwargs)
+        avg_loss = layers.mean(x=loss, **kwargs)
+
+        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+        optimizer.minimize(avg_loss, kwargs['startup_program'])
+
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=8192),
+            batch_size=200)
+
+        place = core.CPUPlace()
+        exe = Executor(place)
+
+        exe.run(kwargs['startup_program'])
+        PASS_NUM = 100
+        for pass_id in range(PASS_NUM):
+            for data in train_reader():
+                x_data = np.array(map(lambda x: x[0], data)).astype("float32")
+                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                y_data = np.expand_dims(y_data, axis=1)
+
+                outs = exe.run(kwargs['main_program'],
+                               feed={'x': x_data,
+                                     'y': y_data},
+                               fetch_list=[avg_loss])
+                print outs[0]
+                if outs[0] < 1.0:
+                    return
+        self.assertFalse(True)
+
+    def test_ifelse(self):
+        kwargs = {'startup_program': Program(), 'main_program': Program()}
+        image = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
+
+        label = layers.data(name='y', shape=[1], dtype='int64', **kwargs)
+
+        limit = layers.fill_constant_batch_size_like(
+            input=label, dtype='int64', shape=[1], value=5.0, **kwargs)
+
+        cond = layers.less_than(x=label, y=limit, **kwargs)
+
+        ie = layers.IfElse(cond, **kwargs)
+
+        with ie.true_block():
+            true_image = ie.input(image)
+            hidden = layers.fc(input=true_image, size=100, act='tanh', **kwargs)
+            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
+            ie.output(prob)
+
+        with ie.false_block():
+            false_image = ie.input(image)
+            hidden = layers.fc(input=false_image,
+                               size=200,
+                               act='tanh',
+                               **kwargs)
+            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
+            ie.output(prob)
+
+        prob = ie()
+        loss = layers.cross_entropy(input=prob[0], label=label, **kwargs)
+        avg_loss = layers.mean(x=loss, **kwargs)
+
+        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+        optimizer.minimize(avg_loss, kwargs['startup_program'])
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=8192),
+            batch_size=200)
+
+        place = core.CPUPlace()
+        exe = Executor(place)
+
+        exe.run(kwargs['startup_program'])
+        PASS_NUM = 100
+        for pass_id in range(PASS_NUM):
+            for data in train_reader():
+                x_data = np.array(map(lambda x: x[0], data)).astype("float32")
+                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                y_data = y_data.reshape((y_data.shape[0], 1))
+
+                outs = exe.run(kwargs['main_program'],
+                               feed={'x': x_data,
+                                     'y': y_data},
+                               fetch_list=[avg_loss])
+                print outs[0]
+                if outs[0] < 1.0:
+                    return
+        self.assertFalse(True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_nccl_init_op.py b/python/paddle/v2/fluid/tests/test_nccl_init_op.py
deleted file mode 100644
index a536800ccd81fdc2f3b7c8320cede4f8ecf3a8cb..0000000000000000000000000000000000000000
--- a/python/paddle/v2/fluid/tests/test_nccl_init_op.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import unittest, os
-import numpy as np
-import paddle.v2 as paddle
-from paddle.v2.fluid.op import Operator
-import paddle.v2.fluid.core as core
-from op_test import OpTest, create_op, set_input
-
-if not core.is_compile_gpu():
-    exit(0)
-
-gpu_count = core.get_cuda_device_count()
-
-if gpu_count <= 1:
-    exit(0)
-
-g_scope = core.Scope()
-g_ctx = core.DeviceContext.create(core.CPUPlace())
-
-
-class TestNCCLInit(unittest.TestCase):
-    def test_init(self):
-        self.op_type = "ncclInit"
-        self.gpus = range(gpu_count)
-
-        self.inputs = {}
-        self.attrs = {"gpus": self.gpus}
-        g_scope.var("Communicator").get_communicator()
-        self.outputs = {"Communicator": g_scope.find_var("Communicator")}
-        nccl_init = create_op(
-            g_scope,
-            op_type=self.op_type,
-            inputs=self.inputs,
-            outputs=self.outputs,
-            attrs=self.attrs)
-        nccl_init.run(g_scope, g_ctx)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_operator_desc.py b/python/paddle/v2/fluid/tests/test_operator_desc.py
index e8362d2e9c6038c04c24dce35de8c53bfde78142..ce34d95ac8cb2644dee9c551cd8e85b33609919a 100644
--- a/python/paddle/v2/fluid/tests/test_operator_desc.py
+++ b/python/paddle/v2/fluid/tests/test_operator_desc.py
@@ -1,11 +1,15 @@
 import unittest
-from paddle.v2.fluid.framework import Variable, Program, g_main_program
+
 import paddle.v2.fluid.core as core
 
+from paddle.v2.fluid.framework import Program, default_startup_program
+
+main_program = default_startup_program()
+
 
 class TestOperator(unittest.TestCase):
     def test_error_type(self):
-        block = g_main_program.create_block()
+        block = main_program.create_block()
         try:
             block.append_op()
             self.assertFail()
diff --git a/python/paddle/v2/fluid/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py
index 7b4237e7fdf5990019ddd85967036ceb598c33df..2459dfd664300d405edb36c4ca906c1769b5e7d2 100644
--- a/python/paddle/v2/fluid/tests/test_optimizer.py
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py
@@ -16,14 +16,18 @@ class TestOptimizer(unittest.TestCase):
             dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
         mul_out = block.create_var(
             dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
         block.append_op(
             type="mul",
             inputs={"X": mul_x,
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
-        opts = sgd_optimizer.minimize(mul_out, init_program)
+        opts = sgd_optimizer.minimize(mean_out, init_program)
         self.assertEqual(len(opts), 1)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "sgd")
@@ -44,12 +48,16 @@ class TestOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         global_step = block.create_var(
             dtype="float32", shape=[1], lod_level=0, name="step")
         learning_rate = 0.01
         sgd_optimizer = optimizer.SGDOptimizer(
             learning_rate=learning_rate, global_step=global_step)
-        opts = sgd_optimizer.minimize(mul_out, init_program)
+        opts = sgd_optimizer.minimize(mean_out, init_program)
         self.assertEqual(len(opts), 2)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "sgd")
@@ -90,7 +98,11 @@ class TestMomentumOptimizer(unittest.TestCase):
         learning_rate = 0.01
         momentum_optimizer = self.MockMomentum(
             learning_rate=learning_rate, momentum=0.2)
-        params_grads = append_backward_ops(mul_out)
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        params_grads = append_backward_ops(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
         opts = momentum_optimizer.create_optimization_pass(
@@ -132,10 +144,14 @@ class TestMomentumOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         learning_rate = 0.01
         momentum_optimizer = self.MockMomentum(
             learning_rate=learning_rate, momentum=0.2, use_nesterov=True)
-        params_grads = append_backward_ops(mul_out)
+        params_grads = append_backward_ops(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
         opts = momentum_optimizer.create_optimization_pass(
@@ -186,10 +202,14 @@ class TestAdagradOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         learning_rate = 0.01
         adagrad_optimizer = self.MockAdagrad(
             learning_rate=learning_rate, epsilon=1.0e-6)
-        params_grads = append_backward_ops(mul_out)
+        params_grads = append_backward_ops(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
         opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -242,10 +262,14 @@ class TestAdamOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         learning_rate = 0.01
         adam_optimizer = self.MockAdam(
             learning_rate=learning_rate, beta1=0.9, beta2=0.999)
-        params_grads = append_backward_ops(mul_out)
+        params_grads = append_backward_ops(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
         opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -300,10 +324,14 @@ class TestAdamaxOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         learning_rate = 0.01
         adamax_optimizer = self.MockAdamax(
             learning_rate=learning_rate, beta1=0.9, beta2=0.999)
-        params_grads = append_backward_ops(mul_out)
+        params_grads = append_backward_ops(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
         opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -355,10 +383,14 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         learning_rate = 0.01
         decayed_adagrad_optimizer = self.MockDecayedAdagrad(
             learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6)
-        params_grads = append_backward_ops(mul_out)
+        params_grads = append_backward_ops(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
         opts = decayed_adagrad_optimizer.create_optimization_pass(
diff --git a/python/paddle/v2/fluid/tests/test_parameter.py b/python/paddle/v2/fluid/tests/test_parameter.py
index a633d22c2b1db2728b6eb767078ce4aec6cce163..694344acbbd3b7c80cb0ff48ada843f794061282 100644
--- a/python/paddle/v2/fluid/tests/test_parameter.py
+++ b/python/paddle/v2/fluid/tests/test_parameter.py
@@ -1,17 +1,19 @@
 import unittest
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.framework import default_main_program
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.io as io
 from paddle.v2.fluid.initializer import ConstantInitializer
 import numpy as np
 
+main_program = default_main_program()
+
 
 class TestParameter(unittest.TestCase):
     def test_param(self):
         shape = [784, 100]
         val = 1.0625
-        b = g_main_program.global_block()
+        b = main_program.global_block()
         param = b.create_parameter(
             name='fc.w',
             shape=shape,
@@ -20,12 +22,12 @@ class TestParameter(unittest.TestCase):
         self.assertIsNotNone(param)
         self.assertEqual('fc.w', param.name)
         self.assertEqual((784, 100), param.shape)
-        self.assertEqual(core.DataType.FP32, param.data_type)
+        self.assertEqual(core.DataType.FP32, param.dtype)
         self.assertEqual(0, param.block.idx)
         exe = Executor(core.CPUPlace())
-        p = exe.run(g_main_program, fetch_list=[param])[0]
-        self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val))
-        p = io.get_parameter_value_by_name('fc.w', exe, g_main_program)
+        p = exe.run(main_program, fetch_list=[param])[0]
+        self.assertTrue(np.allclose(p, np.ones(shape) * val))
+        p = io.get_parameter_value_by_name('fc.w', exe, main_program)
         self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val))
 
 
diff --git a/python/paddle/v2/fluid/tests/test_pool2d_op.py b/python/paddle/v2/fluid/tests/test_pool2d_op.py
index ac3fa6aa87835b3cd6fb9bbf6fe66b1d0c577ca2..5dff6270f455395ce6ca8ae2428236f630467095 100644
--- a/python/paddle/v2/fluid/tests/test_pool2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_pool2d_op.py
@@ -3,8 +3,7 @@ import numpy as np
 from op_test import OpTest
 
 
-def max_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
-
+def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
     N, C, H, W = x.shape
     if global_pool == 1:
         ksize = [H, W]
@@ -23,8 +22,7 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
     return out
 
 
-def avg_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
-
+def avg_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
     N, C, H, W = x.shape
     if global_pool == 1:
         ksize = [H, W]
@@ -47,6 +45,7 @@ def avg_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
 class TestPool2d_Op(OpTest):
     def setUp(self):
         self.init_test_case()
+        self.init_global_pool()
         self.init_op_type()
         self.init_pool_type()
         if self.global_pool:
@@ -75,8 +74,6 @@ class TestPool2d_Op(OpTest):
             self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
 
     def init_test_case(self):
-        self.global_pool = True
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
         self.shape = [2, 3, 5, 5]
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -87,12 +84,14 @@ class TestPool2d_Op(OpTest):
 
     def init_pool_type(self):
         self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = True
 
 
 class TestCase1(TestPool2d_Op):
     def init_test_case(self):
-        self.global_pool = False
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
         self.shape = [2, 3, 7, 7]
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -103,12 +102,14 @@ class TestCase1(TestPool2d_Op):
 
     def init_pool_type(self):
         self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = False
 
 
 class TestCase2(TestPool2d_Op):
     def init_test_case(self):
-        self.global_pool = False
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
         self.shape = [2, 3, 7, 7]
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -119,152 +120,69 @@ class TestCase2(TestPool2d_Op):
 
     def init_pool_type(self):
         self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
 
+    def init_global_pool(self):
+        self.global_pool = False
 
-class TestCase3(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = True
-        self.pool2D_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 5, 5]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [0, 0]
 
+class TestCase3(TestPool2d_Op):
     def init_op_type(self):
         self.op_type = "pool2d"
 
     def init_pool_type(self):
         self.pool_type = "max"
-
-
-class TestCase4(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = False
         self.pool2D_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [0, 0]
 
+
+class TestCase4(TestCase1):
     def init_op_type(self):
         self.op_type = "pool2d"
 
     def init_pool_type(self):
         self.pool_type = "max"
-
-
-class TestCase5(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = False
         self.pool2D_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [1, 1]
 
+
+class TestCase5(TestCase2):
     def init_op_type(self):
         self.op_type = "pool2d"
 
     def init_pool_type(self):
         self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 #--------------------test pool2d_cudnn--------------------
-class TestCaseCudnn1(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = True
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
-        self.shape = [2, 3, 5, 5]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [0, 0]
-
+class TestCudnnCase1(TestPool2d_Op):
     def init_op_type(self):
         self.op_type = "pool2d_cudnn"
 
-    def init_pool_type(self):
-        self.pool_type = "avg"
-
-
-class TestCaseCudnn2(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = False
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [0, 0]
 
+class TestCudnnCase2(TestCase1):
     def init_op_type(self):
         self.op_type = "pool2d_cudnn"
 
-    def init_pool_type(self):
-        self.pool_type = "avg"
-
-
-class TestCaseCudnn3(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = False
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [1, 1]
 
+class TestCudnnCase3(TestCase2):
     def init_op_type(self):
         self.op_type = "pool2d_cudnn"
 
-    def init_pool_type(self):
-        self.pool_type = "avg"
-
-
-class TestCaseCudnn4(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = True
-        self.pool2D_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 5, 5]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [0, 0]
 
+class TestCudnnCase4(TestCase3):
     def init_op_type(self):
         self.op_type = "pool2d_cudnn"
 
-    def init_pool_type(self):
-        self.pool_type = "max"
-
-
-class TestCaseCudnn5(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = False
-        self.pool2D_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [0, 0]
 
+class TestCudnnCase5(TestCase4):
     def init_op_type(self):
         self.op_type = "pool2d_cudnn"
 
-    def init_pool_type(self):
-        self.pool_type = "max"
-
-
-class TestCaseCudnn6(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = False
-        self.pool2D_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [1, 1]
 
+class TestCudnnCase6(TestCase5):
     def init_op_type(self):
         self.op_type = "pool2d_cudnn"
 
-    def init_pool_type(self):
-        self.pool_type = "max"
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_pool3d_op.py b/python/paddle/v2/fluid/tests/test_pool3d_op.py
index 87483ae5e568c01141ff789f37e84069cb8e827d..2ba86665a7d207e61159c02643fa40daca3be080 100644
--- a/python/paddle/v2/fluid/tests/test_pool3d_op.py
+++ b/python/paddle/v2/fluid/tests/test_pool3d_op.py
@@ -3,8 +3,7 @@ import numpy as np
 from op_test import OpTest
 
 
-def max_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
-
+def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
     N, C, D, H, W = x.shape
     if global_pool == 1:
         ksize = [D, H, W]
@@ -27,8 +26,7 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
     return out
 
 
-def avg_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
-
+def avg_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
     N, C, D, H, W = x.shape
     if global_pool == 1:
         ksize = [D, H, W]
@@ -55,6 +53,10 @@ def avg_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
 class TestPool3d_Op(OpTest):
     def setUp(self):
         self.init_test_case()
+        self.init_global_pool()
+        self.init_op_type()
+        self.init_pool_type()
+
         if self.global_pool:
             self.paddings = [0 for _ in range(len(self.paddings))]
         input = np.random.random(self.shape).astype("float32")
@@ -81,74 +83,115 @@ class TestPool3d_Op(OpTest):
             self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
 
     def init_test_case(self):
-        self.global_pool = True
-        self.op_type = "pool3d"
-        self.pool_type = "avg"
-        self.pool3D_forward_naive = avg_pool3D_forward_naive
         self.shape = [2, 3, 5, 5, 5]
         self.ksize = [3, 3, 3]
         self.strides = [1, 1, 1]
         self.paddings = [0, 0, 0]
 
+    def init_op_type(self):
+        self.op_type = "pool3d"
+
+    def init_pool_type(self):
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = True
+
 
 class TestCase1(TestPool3d_Op):
     def init_test_case(self):
-        self.global_pool = False
         self.op_type = "pool3d"
-        self.pool_type = "avg"
-        self.pool3D_forward_naive = avg_pool3D_forward_naive
         self.shape = [2, 3, 7, 7, 7]
         self.ksize = [3, 3, 3]
         self.strides = [1, 1, 1]
         self.paddings = [0, 0, 0]
 
-
-class TestCase2(TestPool3d_Op):
-    def init_test_case(self):
-        self.global_pool = False
+    def init_op_type(self):
         self.op_type = "pool3d"
+
+    def init_pool_type(self):
         self.pool_type = "avg"
         self.pool3D_forward_naive = avg_pool3D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = False
+
+
+class TestCase2(TestPool3d_Op):
+    def init_test_case(self):
         self.shape = [2, 3, 7, 7, 7]
         self.ksize = [3, 3, 3]
         self.strides = [1, 1, 1]
         self.paddings = [1, 1, 1]
 
+    def init_op_type(self):
+        self.op_type = "pool3d"
+
+    def init_pool_type(self):
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = False
+
 
 class TestCase3(TestPool3d_Op):
-    def init_test_case(self):
-        self.global_pool = True
+    def init_op_type(self):
         self.op_type = "pool3d"
+
+    def init_pool_type(self):
         self.pool_type = "max"
         self.pool3D_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 5, 5, 5]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [0, 0, 0]
 
 
-class TestCase4(TestPool3d_Op):
-    def init_test_case(self):
-        self.global_pool = False
+class TestCase4(TestCase1):
+    def init_op_type(self):
         self.op_type = "pool3d"
+
+    def init_pool_type(self):
         self.pool_type = "max"
         self.pool3D_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 7, 7, 7]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [0, 0, 0]
 
 
-class TestCase5(TestPool3d_Op):
-    def init_test_case(self):
-        self.global_pool = False
+class TestCase5(TestCase2):
+    def init_op_type(self):
         self.op_type = "pool3d"
+
+    def init_pool_type(self):
         self.pool_type = "max"
         self.pool3D_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 7, 7, 7]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [1, 1, 1]
+
+
+#--------------------test pool3d_cudnn--------------------
+class TestCudnnCase1(TestPool3d_Op):
+    def init_op_type(self):
+        self.op_type = "pool3d_cudnn"
+
+
+class TestCudnnCase2(TestCase1):
+    def init_op_type(self):
+        self.op_type = "pool3d_cudnn"
+
+
+class TestCudnnCase3(TestCase2):
+    def init_op_type(self):
+        self.op_type = "pool3d_cudnn"
+
+
+class TestCudnnCase4(TestCase3):
+    def init_op_type(self):
+        self.op_type = "pool3d_cudnn"
+
+
+class TestCudnnCase5(TestCase4):
+    def init_op_type(self):
+        self.op_type = "pool3d_cudnn"
+
+
+class TestCudnnCase6(TestCase5):
+    def init_op_type(self):
+        self.op_type = "pool3d_cudnn"
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_pool_max_op.py b/python/paddle/v2/fluid/tests/test_pool_max_op.py
index 04843a28ac19e076e097d1aa1034bcf9378aa495..9d2d61c43868701392e90542f3b7fb2c4ea07548 100644
--- a/python/paddle/v2/fluid/tests/test_pool_max_op.py
+++ b/python/paddle/v2/fluid/tests/test_pool_max_op.py
@@ -3,11 +3,13 @@ import numpy as np
 from op_test import OpTest
 
 
-def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False):
 
     N, C, D, H, W = x.shape
-    if global_pool == 1:
+    if global_pool:
         ksize = [D, H, W]
+        paddings = [0, 0, 0]
+
     D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
     H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
     W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
@@ -40,11 +42,13 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
     return out, mask
 
 
-def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False):
 
     N, C, H, W = x.shape
-    if global_pool == 1:
+    if global_pool:
         ksize = [H, W]
+        paddings = [0, 0]
+
     H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
     W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
@@ -74,13 +78,13 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
 class TestMaxPoolWithIndex_Op(OpTest):
     def setUp(self):
         self.init_test_case()
-        if self.global_pool:
-            self.paddings = [0 for _ in range(len(self.paddings))]
+        self.init_global()
+
         input = np.random.random(self.shape).astype("float32")
         output, mask = self.pool_forward_naive(input, self.ksize, self.strides,
                                                self.paddings, self.global_pool)
         output = output.astype("float32")
-        mask = mask.astype("float32")
+        mask = mask.astype("int32")
 
         self.attrs = {
             'strides': self.strides,
@@ -99,41 +103,24 @@ class TestMaxPoolWithIndex_Op(OpTest):
     #     self.check_grad(set(['X']), ['Out'], max_relative_error=0.07)
 
     def init_test_case(self):
-        self.global_pool = True
-        self.index = "max_pool3d_with_index"
-        self.op_type = "%s" % self.index
+        self.op_type = "max_pool3d_with_index"
         self.pool_forward_naive = max_pool3D_forward_naive
         self.shape = [2, 3, 5, 5, 5]
         self.ksize = [3, 3, 3]
         self.strides = [1, 1, 1]
         self.paddings = [1, 1, 1]
 
+    def init_global(self):
+        self.global_pool = False
+
 
 class TestCase1(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
+    def init_global(self):
         self.global_pool = True
-        self.op_type = "max_pool3d_with_index"
-        self.pool_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 5, 5, 5]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [1, 1, 1]
 
 
 class TestCase2(TestMaxPoolWithIndex_Op):
     def init_test_case(self):
-        self.global_pool = False
-        self.op_type = "max_pool3d_with_index"
-        self.pool_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 7, 7, 7]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [1, 1, 1]
-
-
-class TestCase3(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
-        self.global_pool = False
         self.op_type = "max_pool3d_with_index"
         self.pool_forward_naive = max_pool3D_forward_naive
         self.shape = [2, 3, 7, 7, 7]
@@ -141,32 +128,18 @@ class TestCase3(TestMaxPoolWithIndex_Op):
         self.strides = [2, 2, 2]
         self.paddings = [0, 0, 0]
 
-
-class TestCase4(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
+    def init_global(self):
         self.global_pool = True
-        self.op_type = "max_pool3d_with_index"
-        self.pool_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 5, 5, 5]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [1, 1, 1]
 
 
-class TestCase5(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
-        self.global_pool = True
-        self.op_type = "max_pool3d_with_index"
-        self.pool_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 5, 5, 5]
-        self.ksize = [3, 3, 3]
-        self.strides = [2, 2, 2]
-        self.paddings = [0, 0, 0]
+class TestCase3(TestCase2):
+    def init_global(self):
+        self.global_pool = False
 
 
-class TestCase6(TestMaxPoolWithIndex_Op):
+#----------------max_pool2d_with_index----------------
+class TestCase4(TestMaxPoolWithIndex_Op):
     def init_test_case(self):
-        self.global_pool = False
         self.op_type = "max_pool2d_with_index"
         self.pool_forward_naive = max_pool2D_forward_naive
         self.shape = [2, 3, 7, 7]
@@ -174,10 +147,17 @@ class TestCase6(TestMaxPoolWithIndex_Op):
         self.strides = [1, 1]
         self.paddings = [1, 1]
 
+    def init_global(self):
+        self.global_pool = True
+
 
-class TestCase7(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
+class TestCase5(TestCase4):
+    def init_global(self):
         self.global_pool = False
+
+
+class TestCase6(TestMaxPoolWithIndex_Op):
+    def init_test_case(self):
         self.op_type = "max_pool2d_with_index"
         self.pool_forward_naive = max_pool2D_forward_naive
         self.shape = [2, 3, 7, 7]
@@ -185,27 +165,13 @@ class TestCase7(TestMaxPoolWithIndex_Op):
         self.strides = [2, 2]
         self.paddings = [0, 0]
 
-
-class TestCase8(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
+    def init_global(self):
         self.global_pool = True
-        self.op_type = "max_pool2d_with_index"
-        self.pool_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 5, 5]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [1, 1]
 
 
-class TestCase9(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
-        self.global_pool = True
-        self.op_type = "max_pool2d_with_index"
-        self.pool_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 5, 5]
-        self.ksize = [3, 3]
-        self.strides = [2, 2]
-        self.paddings = [0, 0]
+class TestCase7(TestCase6):
+    def init_global(self):
+        self.global_pool = False
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..395d0dc36a3d1d6fbfebb4cdf34395c4edee412d
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
@@ -0,0 +1,28 @@
+import unittest
+import numpy as np
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.profiler as profiler
+import paddle.v2.fluid.layers as layers
+
+
+class TestProfiler(unittest.TestCase):
+    def test_nvprof(self):
+        if not fluid.core.is_compile_gpu():
+            return
+        epoc = 8
+        dshape = [4, 3, 28, 28]
+        data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+        conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+
+        place = fluid.GPUPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+            for i in range(epoc):
+                input = np.random.random(dshape).astype('float32')
+                exe.run(fluid.default_main_program(), feed={'data': input})
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_program.py b/python/paddle/v2/fluid/tests/test_program.py
index ef2daf6916e14c015a39ae0193948e7ff6531449..1a9313c68aab165d85ae29051faeacb4927ac2c9 100644
--- a/python/paddle/v2/fluid/tests/test_program.py
+++ b/python/paddle/v2/fluid/tests/test_program.py
@@ -1,36 +1,38 @@
+from __future__ import print_function
 import unittest
 
-import paddle.v2.fluid.core as core
-from paddle.v2.fluid.framework import Program
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.framework import Program, default_main_program
+import paddle.v2.fluid.layers as layers
+
+main_program = default_main_program()
 
 
 class TestProgram(unittest.TestCase):
     def test_program(self):
-        b = g_main_program.current_block()
+        b = main_program.current_block()
         self.assertEqual(-1, b.parent_idx)
         self.assertEqual(0, b.idx)
 
-        b = g_main_program.create_block()
+        b = main_program.create_block()
         self.assertEqual(1, b.idx)
         self.assertEqual(0, b.parent_idx)
 
-        b = g_main_program.create_block()
+        b = main_program.create_block()
         self.assertEqual(2, b.idx)
         self.assertEqual(1, b.parent_idx)
 
-        g_main_program.rollback()
+        main_program.rollback()
 
-        b = g_main_program.current_block()
+        b = main_program.current_block()
         self.assertEqual(1, b.idx)
         self.assertEqual(0, b.parent_idx)
 
-        b = g_main_program.create_block()
+        b = main_program.create_block()
         self.assertEqual(3, b.idx)
         self.assertEqual(1, b.parent_idx)
 
-        g_main_program.rollback()
-        b = g_main_program.current_block()
+        main_program.rollback()
+        b = main_program.current_block()
         self.assertEqual(1, b.idx)
         self.assertEqual(0, b.parent_idx)
 
@@ -49,8 +51,8 @@ class TestProgram(unittest.TestCase):
 
         # FIXME(yuyang18): We manual compare the output string, since the order
         # of variable could be changed.
-        print prog
-        print prog.clone()
+        print(prog)
+        print(prog.clone())
 
     def test_parse_program_from_string(self):
         prog = Program()
@@ -68,8 +70,8 @@ class TestProgram(unittest.TestCase):
         binary_str = prog.desc.serialize_to_string()
         prog_restored = Program.parse_from_string(binary_str)
 
-        print prog
-        print prog_restored
+        print(prog)
+        print(prog_restored)
 
     def test_append_backward(self):
         prog = Program()
@@ -98,27 +100,46 @@ class TestProgram(unittest.TestCase):
                     "Y": add_y},
             outputs={"Out": add_out},
             attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": add_out}, outputs={"Out": mean_out})
 
         self.assertEqual(mul_op.idx, 0)
         self.assertEqual(add_op.idx, 1)
-        param_to_grad = prog.append_backward(add_out, set())
+        param_to_grad = prog.append_backward(mean_out, set())
 
         def grad_name(name):
             return name + "@GRAD"
 
-        for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out"):
+        for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out",
+                         "mean.out"):
             self.assertEqual(param_to_grad[var_name][0], grad_name(var_name))
             self.assertEqual(param_to_grad[var_name][1], 0)
 
         expect_ops = [
-            "mul", "elementwise_add", "fill_constant", "elementwise_add_grad",
-            "mul_grad"
+            "mul", "elementwise_add", "mean", "fill_constant", "mean_grad",
+            "elementwise_add_grad", "mul_grad"
         ]
         actual_ops = []
         for op in block.ops:
             actual_ops.append(op.type)
         self.assertEqual(actual_ops, expect_ops)
 
+    def test_program_clone_with_parameter(self):
+        main_program = Program()
+        startup_program = Program()
+        kwargs = {
+            'main_program': main_program,
+            'startup_program': startup_program
+        }
+        d = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
+        hidden = layers.fc(input=d, size=100, **kwargs)
+        layers.fc(input=hidden, size=100, **kwargs)
+
+        new_program = main_program.clone()
+        self.assertNotEqual(0, len(new_program.blocks[0].all_parameters()))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_protobuf_descs.py b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
index 098a9802dfc6763ce2a2356b7267a439145b7939..d8abe17606c4ddb2ff51d5f918b1e5d7e110f7fa 100644
--- a/python/paddle/v2/fluid/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
@@ -101,13 +101,13 @@ class TestVarDesc(unittest.TestCase):
         self.assertEqual(src_shape, res_shape)
         self.assertEqual(core.VarDesc.VarType.SELECTED_ROWS, var.type())
 
-    def test_data_type(self):
+    def test_dtype(self):
         program_desc = core.ProgramDesc()
         block = program_desc.block(0)
         var = block.var('my_var')
         var.set_type(core.VarDesc.VarType.LOD_TENSOR)
-        var.set_data_type(core.DataType.INT32)
-        self.assertEqual(core.DataType.INT32, var.data_type())
+        var.set_dtype(core.DataType.INT32)
+        self.assertEqual(core.DataType.INT32, var.dtype())
         self.assertEqual(core.VarDesc.VarType.LOD_TENSOR, var.type())
 
 
diff --git a/python/paddle/v2/fluid/tests/test_recurrent_op.py b/python/paddle/v2/fluid/tests/test_recurrent_op.py
index b623d1231838faff9e91c9234befb1f647fe8ec2..36e0c84c0b8e7d40aa56d75c8904a38694881be4 100644
--- a/python/paddle/v2/fluid/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
@@ -118,14 +118,14 @@ class RecurrentOpTest1(unittest.TestCase):
     def create_rnn_op(self):
         x = layers.data(
             shape=[self.sent_len, self.batch_size, self.input_dim],
-            data_type='float32',
+            dtype='float32',
             name='x',
             append_batch_size=False,
             **self.p_info)
         x.stop_gradient = False
         h_boot = layers.data(
             shape=[self.input_dim],
-            data_type='float32',
+            dtype='float32',
             name='h_boot',
             **self.p_info)
         h_boot.stop_gradient = False
@@ -156,7 +156,7 @@ class RecurrentOpTest1(unittest.TestCase):
                       feed=self.feed_map,
                       fetch_list=[self.output])
 
-        return np.array(out[0])
+        return out[0]
 
     def backward(self):
         self.feed_map = {
@@ -171,7 +171,8 @@ class RecurrentOpTest1(unittest.TestCase):
         exe = Executor(self.place)
         return exe.run(self.main_program,
                        feed=self.feed_map,
-                       fetch_list=fetch_list)
+                       fetch_list=fetch_list,
+                       return_numpy=False)
 
     def test_backward(self):
         self.check_forward()
@@ -251,14 +252,14 @@ class RecurrentOpTest2(RecurrentOpTest1):
     def create_rnn_op(self):
         x = layers.data(
             shape=[self.sent_len, self.batch_size, self.input_dim],
-            data_type='float32',
+            dtype='float32',
             name='x',
             append_batch_size=False,
             **self.p_info)
         x.stop_gradient = False
         h_boot = layers.data(
             shape=[self.input_dim],
-            data_type='float32',
+            dtype='float32',
             name='h_boot',
             **self.p_info)
         h_boot.stop_gradient = False
@@ -270,12 +271,12 @@ class RecurrentOpTest2(RecurrentOpTest1):
 
             temp_l = layers.fc(input=x_t,
                                size=self.input_dim,
-                               param_attr={'name': 'W'},
+                               param_attr='W',
                                bias_attr=False,
                                **self.p_info)
             temp_r = layers.fc(input=h_pre,
                                size=self.input_dim,
-                               param_attr={'name': 'U'},
+                               param_attr='U',
                                bias_attr=False,
                                **self.p_info)
 
@@ -350,21 +351,21 @@ class RecurrentOpMultipleMemoryTest(RecurrentOpTest1):
     def create_rnn_op(self):
         x = layers.data(
             shape=[self.sent_len, self.batch_size, self.input_dim],
-            data_type='float32',
+            dtype='float32',
             name='x',
             append_batch_size=False,
             **self.p_info)
         x.stop_gradient = False
         h_boot1 = layers.data(
             shape=[self.batch_size, self.input_dim],
-            data_type='float32',
+            dtype='float32',
             name='h_boot1',
             append_batch_size=False,
             **self.p_info)
         h_boot1.stop_gradient = False
         h_boot2 = layers.data(
             shape=[self.batch_size, self.input_dim],
-            data_type='float32',
+            dtype='float32',
             name='h_boot2',
             append_batch_size=False,
             **self.p_info)
@@ -435,7 +436,7 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1):
     def create_rnn_op(self):
         x = layers.data(
             shape=[self.sent_len, self.batch_size, self.input_dim],
-            data_type='float32',
+            dtype='float32',
             name='x',
             append_batch_size=False,
             **self.p_info)
diff --git a/python/paddle/v2/fluid/tests/test_regularizer.py b/python/paddle/v2/fluid/tests/test_regularizer.py
index f5d1eb3b96211bd7c7335dbe116a1d765d7bae50..24baf55e90c98f39bab926e8c85a791eee5ed4a4 100644
--- a/python/paddle/v2/fluid/tests/test_regularizer.py
+++ b/python/paddle/v2/fluid/tests/test_regularizer.py
@@ -29,7 +29,11 @@ class TestL2DecayRegularizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
-        params_grads = append_backward_ops(mul_out)
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        params_grads = append_backward_ops(mean_out)
         self.assertEqual(len(params_grads), 1)
         count_ops = len(block.ops)
         params_grads = optimizer.append_regularization_ops(params_grads)
@@ -62,7 +66,11 @@ class TestL1DecayRegularizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
-        params_grads = append_backward_ops(mul_out)
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        params_grads = append_backward_ops(mean_out)
         self.assertEqual(len(params_grads), 1)
         count_ops = len(block.ops)
         params_grads = optimizer.append_regularization_ops(params_grads)
diff --git a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
index a3cba92504a28590083df57e69f7662a887d94a6..9999165ed509aa40f31f26aa676f381561bd0016 100644
--- a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
+++ b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
@@ -7,12 +7,6 @@ import numpy as np
 import paddle.v2.fluid.core as core
 
 
-def create_tensor(np_data, place):
-    tensor = core.LoDTensor()
-    tensor.set(np_data, place)
-    return tensor
-
-
 class RNNMemoryHelperOpTest(unittest.TestCase):
     def setUp(self):
         self.program = Program()
@@ -30,13 +24,13 @@ class RNNMemoryHelperOpTest(unittest.TestCase):
 
     def test_forward(self):
         x_np = np.random.normal(size=(2, 3)).astype("float32")
-        self.feed_map = {'X': create_tensor(x_np, self.place)}
+        self.feed_map = {'X': x_np}
         self.fetch_list = [self.Out]
         exe = Executor(self.place)
         out = exe.run(self.program,
                       feed=self.feed_map,
                       fetch_list=self.fetch_list)
-        np.isclose(np.array(out[0]), x_np, rtol=1e-5)
+        self.assertTrue(np.allclose(out[0], x_np, rtol=1e-5))
 
 
 class RNNMemoryHelperGradOpTest(unittest.TestCase):
@@ -66,8 +60,7 @@ class RNNMemoryHelperGradOpTest(unittest.TestCase):
 
     def test_backward(self):
         self.feed_map = {
-            name: create_tensor(
-                np.random.normal(size=(2, 3)).astype("float32"), self.place)
+            name: np.random.normal(size=(2, 3)).astype("float32")
             for name in self.input_names
         }
         self.fetch_list = [self.output_vars['X@GRAD']]
@@ -76,7 +69,7 @@ class RNNMemoryHelperGradOpTest(unittest.TestCase):
         out = exe.run(self.program,
                       feed=self.feed_map,
                       fetch_list=self.fetch_list)
-        np.isclose(np.array(out[0]), self.feed_map['Out@GRAD'], rtol=1e-5)
+        np.isclose(out[0], self.feed_map['Out@GRAD'], rtol=1e-5)
 
 
 class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase):
@@ -110,8 +103,7 @@ class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase):
 
     def test_backward(self):
         self.feed_map = {
-            name: create_tensor(
-                np.random.normal(size=(2, 3)).astype("float32"), self.place)
+            name: np.random.normal(size=(2, 3)).astype("float32")
             for name in ['X', 'Out']
         }
         self.fetch_list = [self.output_vars['X@GRAD']]
@@ -120,10 +112,9 @@ class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase):
         out = exe.run(self.program,
                       feed=self.feed_map,
                       fetch_list=self.fetch_list)
-        np.isclose(
-            np.array(out[0]),
-            np.zeros(shape=(2, 3)).astype("float32"),
-            rtol=1e-5)
+        self.assertTrue(
+            np.allclose(
+                out[0], np.zeros(shape=(2, 3)).astype("float32"), rtol=1e-5))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_roi_pool_op.py b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..a28d9c7f82d3735c410369eb61e350168c267cea
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
@@ -0,0 +1,123 @@
+import unittest
+import numpy as np
+import math
+import sys
+from op_test import OpTest
+
+
+class TestROIPoolOp(OpTest):
+    def set_data(self):
+        self.init_test_case()
+        self.make_rois()
+        self.calc_roi_pool()
+
+        self.inputs = {'X': self.x, 'ROIs': self.rois}
+
+        self.attrs = {
+            'spatial_scale': self.spatial_scale,
+            'pooled_height': self.pooled_height,
+            'pooled_width': self.pooled_width
+        }
+
+        self.outputs = {'Out': self.outs, 'Argmax': self.argmaxes}
+
+    def init_test_case(self):
+        self.batch_size = 5
+        self.channels = 3
+        self.height = 6
+        self.width = 4
+
+        # n, c, h, w
+        self.x_dim = (self.batch_size, self.channels, self.height, self.width)
+
+        self.spatial_scale = 1.0 / 4.0
+        self.pooled_height = 2
+        self.pooled_width = 2
+        self.rois_num = 2
+
+        self.x = np.random.random(self.x_dim).astype('float32')
+
+    def calc_roi_pool(self):
+        out_data = np.zeros((self.rois_num, self.channels, self.pooled_height,
+                             self.pooled_width))
+        argmax_data = np.zeros((self.rois_num, self.channels,
+                                self.pooled_height, self.pooled_width))
+
+        for i in range(self.rois_num):
+            roi = self.rois[i]
+            roi_batch_id = roi[0]
+            roi_start_w = int(round(roi[1] * self.spatial_scale))
+            roi_start_h = int(round(roi[2] * self.spatial_scale))
+            roi_end_w = int(round(roi[3] * self.spatial_scale))
+            roi_end_h = int(round(roi[4] * self.spatial_scale))
+
+            roi_height = int(max(roi_end_h - roi_start_h + 1, 1))
+            roi_width = int(max(roi_end_w - roi_start_w + 1, 1))
+
+            x_i = self.x[roi_batch_id]
+
+            bin_size_h = float(roi_height) / float(self.pooled_height)
+            bin_size_w = float(roi_width) / float(self.pooled_width)
+
+            for c in range(self.channels):
+                for ph in range(self.pooled_height):
+                    for pw in range(self.pooled_width):
+                        hstart = int(math.floor(ph * bin_size_h))
+                        wstart = int(math.floor(pw * bin_size_w))
+                        hend = int(math.ceil((ph + 1) * bin_size_h))
+                        wend = int(math.ceil((pw + 1) * bin_size_w))
+
+                        hstart = min(max(hstart + roi_start_h, 0), self.height)
+                        hend = min(max(hend + roi_start_h, 0), self.height)
+                        wstart = min(max(wstart + roi_start_w, 0), self.width)
+                        wend = min(max(wend + roi_start_w, 0), self.width)
+
+                        is_empty = (hend <= hstart) or (wend <= wstart)
+                        if is_empty:
+                            out_data[i, c, ph, pw] = 0
+                        else:
+                            out_data[i, c, ph, pw] = -sys.float_info.max
+
+                        argmax_data[i, c, ph, pw] = -1
+
+                        for h in range(hstart, hend):
+                            for w in range(wstart, wend):
+                                if x_i[c, h, w] > out_data[i, c, ph, pw]:
+                                    out_data[i, c, ph, pw] = x_i[c, h, w]
+                                    argmax_data[i, c, ph, pw] = h * \
+                                        self.width + w
+
+        self.outs = out_data.astype('float32')
+        self.argmaxes = argmax_data.astype('int64')
+
+    def make_rois(self):
+        rois = []
+        batch_ids = np.random.randint(0, self.batch_size, size=self.rois_num)
+        for i in range(self.rois_num):
+            x1 = np.random.random_integers(
+                0, self.width / self.spatial_scale - self.pooled_width)
+            y1 = np.random.random_integers(
+                0, self.height / self.spatial_scale - self.pooled_height)
+
+            x2 = np.random.random_integers(x1 + self.pooled_width,
+                                           self.width / self.spatial_scale)
+            y2 = np.random.random_integers(y1 + self.pooled_height,
+                                           self.height / self.spatial_scale)
+
+            roi = [batch_ids[i], x1, y1, x2, y2]
+            rois.append(roi)
+        self.rois = np.array(rois).astype("int64")
+
+    def setUp(self):
+        self.op_type = "roi_pool"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_sequence_slice_op.py b/python/paddle/v2/fluid/tests/test_sequence_slice_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccd9a05343b0c4aa05b258959665c0662f271512
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_sequence_slice_op.py
@@ -0,0 +1,47 @@
+import unittest
+import numpy as np
+import sys
+from op_test import OpTest
+
+
+class TestSequenceSliceOp(OpTest):
+    def set_data(self):
+        self.init_test_case()
+        # only supprot one level LoD
+        x = np.random.random(self.x_dim).astype('float32')
+        lod = self.x_lod
+        offset = np.array(self.offset).astype("int64")
+        length = np.array(self.length).astype("int64")
+
+        self.inputs = {'X': (x, lod), 'Offset': offset, 'Length': length}
+        outs = []  #np.zeros((100, 3, 2)).astype('float32')
+        out_lod = [[0]]
+        out_lod_offset = 0
+        for i in range(len(offset)):
+            sub_x = x[lod[0][i] + offset[i, 0]:lod[0][i] + offset[i, 0] +
+                      length[i, 0], :]
+            out_lod_offset = out_lod_offset + len(sub_x)
+            outs.append(sub_x)
+            out_lod[0].append(out_lod_offset)
+        outs = np.concatenate(outs, axis=0)
+        self.outputs = {'Out': (outs, out_lod)}
+
+    def init_test_case(self):
+        self.x_dim = (100, 3, 2)
+        self.x_lod = [[0, 20, 40, 60, 80, 100]]
+        self.offset = [[1], [2], [3], [4], [5]]
+        self.length = [[10], [8], [6], [4], [2]]
+
+    def setUp(self):
+        self.op_type = "sequence_slice"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
index 1a3b88e18e38b88d75ad17a0bb6a2965d1e60406..86db4c64b493d94cc675ed4bcee7e2925fef1977 100644
--- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
+++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
@@ -3,13 +3,15 @@ import paddle.v2.fluid.core as core
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.backward import append_backward_ops
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.framework import default_main_program
 import numpy
 
+main_program = default_main_program()
+
 
 class TestShrinkRNNMemory(unittest.TestCase):
     def test_shrink_rnn_memory(self):
-        x = layers.data('x', shape=[100], data_type='float32')
+        x = layers.data('x', shape=[100], dtype='float32')
         x.stop_gradient = False
         table = layers.lod_rank_table(x=x)
         i = layers.zeros(dtype='int64', shape=[1])
@@ -27,19 +29,16 @@ class TestShrinkRNNMemory(unittest.TestCase):
         tensor_np = numpy.random.random(size=(3, 100)).astype('float32')
         tensor.set(tensor_np, cpu)
         exe = Executor(cpu)
-        outs = map(numpy.array,
-                   exe.run(feed={'x': tensor}, fetch_list=[mem1, mem2, mem3]))
+        outs = exe.run(feed={'x': tensor}, fetch_list=[mem1, mem2, mem3])
         self.assertTrue(numpy.allclose(tensor_np[0:3], outs[0]))
         self.assertTrue(numpy.allclose(tensor_np[0:2], outs[1]))
         self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2]))
 
         mem3_mean = layers.mean(x=mem3)
         append_backward_ops(loss=mem3_mean)
-        x_grad = map(numpy.array,
-                     exe.run(feed={'x': tensor},
-                             fetch_list=[
-                                 g_main_program.global_block().var('x@GRAD')
-                             ]))[0]
+        x_grad = exe.run(
+            feed={'x': tensor},
+            fetch_list=[main_program.global_block().var('x@GRAD')])[0]
         self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1)
 
 
diff --git a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
index 3aed83b2ea3418c54f9540279ae6e2e0045421fa..f5da4e408f0a83dbf6da530b478e91bbf9cd5ab2 100644
--- a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
@@ -98,7 +98,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
 
         exe = Executor(place)
         scope = core.Scope()
-        exe.run(program, feed={'x': tensor, 'y': mask}, scope=scope)
+        exe.run(program,
+                feed={'x': tensor,
+                      'y': mask},
+                scope=scope,
+                return_numpy=False)
 
         var_true = scope.find_var(out_true.name).get_tensor()
 
@@ -123,13 +127,13 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
         x = layers.data(
             name='x',
             shape=[1],
-            data_type='float32',
+            dtype='float32',
             main_program=program,
             stop_gradient=False)
         y = layers.data(
             name='y',
             shape=[1],
-            data_type='bool',
+            dtype='bool',
             main_program=program,
             stop_gradient=False)
 
@@ -169,7 +173,8 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
                                     feed={'x': tensor,
                                           'y': mask},
                                     fetch_list=[g_vars],
-                                    scope=scope))
+                                    scope=scope,
+                                    return_numpy=False))
         ]
 
         g_out_sum = np.array(g_out).sum()
diff --git a/python/paddle/v2/fluid/tests/test_tensor_array.py b/python/paddle/v2/fluid/tests/test_tensor_array.py
deleted file mode 100644
index d6929ba16e4dae0c57adcceb4f0e78c094eee55c..0000000000000000000000000000000000000000
--- a/python/paddle/v2/fluid/tests/test_tensor_array.py
+++ /dev/null
@@ -1,106 +0,0 @@
-import logging
-import paddle.v2.fluid.core as core
-import unittest
-import numpy as np
-
-
-class TestTensorArray(unittest.TestCase):
-    def setUp(self):
-        self.ta = core.TensorArray()
-
-        self.batch_size = 10
-        self.dim = 2
-
-        # create a LoDTensor
-        self.scope = core.Scope()
-        var = self.scope.var("test_tensor")
-        self.place = core.CPUPlace()
-        tensor = var.get_tensor()
-        tensor.set_dims([self.batch_size, self.dim])
-        tensor.alloc_float(self.place)
-        tensor_array = np.array(tensor)
-        tensor_array[0, 0] = 0
-        tensor_array[1, 0] = 1
-        tensor_array[2, 0] = 2
-        tensor_array[3, 0] = 3
-        tensor_array[4, 0] = 4
-        tensor_array[5, 0] = 5
-        tensor_array[6, 0] = 6
-        tensor_array[7, 0] = 7
-        tensor_array[8, 0] = 8
-        tensor_array[9, 0] = 9
-
-        lod_py = [[0, 2, 5, 10]]
-        lod_tensor = core.LoDTensor(lod_py)
-        lod_tensor.set(tensor_array, self.place)
-
-        self.py_seq_meta = [[5, 10, 2], [2, 5, 1], [0, 2, 0]]
-
-        self.tensor = lod_tensor
-
-    def test_unstack(self):
-        self.ta.unstack(self.tensor)
-        self.assertEqual(self.tensor.get_dims()[0], self.ta.size())
-
-    def test_read(self):
-        self.ta.unstack(self.tensor)
-        for i in range(self.batch_size):
-            tensor = self.ta.read(i)
-
-    def test_write(self):
-        self.ta.unstack(self.tensor)
-
-        # create a tensor with shape of [1, self.dim]
-        var = self.scope.var("hell")
-        tensor = var.get_tensor()
-        tensor.set_dims([1, self.dim])
-        tensor.alloc_float(self.place)
-        tensor_array = np.array(tensor)
-        for i in range(self.dim):
-            tensor_array[0, i] = i
-        tensor.set(tensor_array, self.place)
-
-        self.ta.write(2, tensor)
-
-        ta_tensor = self.ta.read(2)
-        ta_tensor_array = np.array(ta_tensor)
-        self.assertEqual(ta_tensor.get_dims(), [1, self.dim])
-        self.assertTrue((tensor_array == ta_tensor_array).all())
-
-    def test_write_shared(self):
-        self.ta.unstack(self.tensor)
-
-        # create a tensor with shape of [1, self.dim]
-        var = self.scope.var("hell")
-        tensor = var.get_tensor()
-        tensor.set_dims([1, self.dim])
-        tensor.alloc_float(self.place)
-        tensor_array = np.array(tensor)
-        for i in range(self.dim):
-            tensor_array[0, i] = i
-        tensor.set(tensor_array, self.place)
-
-        self.ta.write_shared(2, tensor)
-
-        ta_tensor = self.ta.read(2)
-        ta_tensor_array = np.array(ta_tensor)
-        self.assertEqual(ta_tensor.get_dims(), [1, self.dim])
-        self.assertTrue((tensor_array == ta_tensor_array).all())
-
-    def test_unpack(self):
-        meta = self.ta.unpack(self.tensor, 0, True)
-        self.assertEqual(self.ta.size(), 5)
-        self.assertEqual(meta, self.py_seq_meta)
-
-    def test_pack(self):
-        meta = self.ta.unpack(self.tensor, 0, True)
-        print "meta", meta
-        tensor = self.ta.pack(0, meta, self.tensor.lod())
-        print np.array(self.tensor)
-        print np.array(tensor)
-        self.assertTrue((np.array(self.tensor) == np.array(tensor)).all())
-        self.assertTrue(tensor.lod(), self.tensor.lod())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e87f283042c081ed9f232d140ff8c303cd3d1858
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_unpool_op.py
@@ -0,0 +1,83 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings):
+    s0, s1, s2, s3 = input.shape
+    out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0]
+    out_wsize = (s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1]
+    out = np.zeros((s0, s1, out_hsize, out_wsize))
+    for nidx in xrange(s0):
+        for cidx in xrange(s1):
+            for h in xrange(s2):
+                for w in xrange(s3):
+                    index = indices[nidx, cidx, h, w]
+                    hidx = (index - index % out_wsize) / out_wsize
+                    widx = index % out_wsize
+                    out[nidx, cidx, int(hidx), int(widx)] = \
+                            input[nidx, cidx, h, w]
+
+    return out
+
+
+class TestUnpoolOp(OpTest):
+    def setUp(self):
+        self.op_type = "unpool"
+        self.init_test_case()
+        pre_input = np.random.random(self.shape).astype("float32")
+        nsize, csize, hsize, wsize = pre_input.shape
+        hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) / \
+                self.strides[0] + 1
+        wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) / \
+                self.strides[1] + 1
+        input = np.zeros((nsize, csize, hsize_out, wsize_out))
+        indices = np.zeros((nsize, csize, hsize_out, wsize_out))
+        for i in xrange(hsize_out):
+            for j in xrange(wsize_out):
+                r_start = np.max((i * self.strides[0] - self.paddings[0], 0))
+                r_end = np.min((i * self.strides[0] + self.ksize[0] - \
+                        self.paddings[0], hsize))
+                c_start = np.max((j * self.strides[1] - self.paddings[1], 0))
+                c_end = np.min((j * self.strides[1] + self.ksize[1] - \
+                        self.paddings[1], wsize))
+                for nidx in xrange(nsize):
+                    for cidx in xrange(csize):
+                        x_masked = pre_input[nidx, cidx, r_start:r_end, \
+                                c_start:c_end]
+                        input[nidx, cidx, i, j] = x_masked.max()
+                        arg = x_masked.argmax()
+                        indices[nidx, cidx, i, j] = \
+                                (r_start + arg / self.ksize[1]) * wsize + \
+                                c_start + arg % self.ksize[1]
+        output = self.unpool2d_forward_naive(input, indices, self.ksize, \
+                self.strides, self.paddings).astype("float32")
+        self.inputs = {
+            'X': input.astype('float32'),
+            'Indices': indices.astype('int32')
+        }
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'unpooling_type': self.unpooling_type,
+        }
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+    def init_test_case(self):
+        self.unpool2d_forward_naive = unpool2dmax_forward_naive
+        self.unpooling_type = "max"
+        self.shape = [6, 4, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [2, 2]
+        self.paddings = [0, 0]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_variable.py b/python/paddle/v2/fluid/tests/test_variable.py
index a3e60a751719666bdca56a3096b688125d09f4b2..f1e4c0ba21d5c4f10d2b5011bdb5abaebaec5431 100644
--- a/python/paddle/v2/fluid/tests/test_variable.py
+++ b/python/paddle/v2/fluid/tests/test_variable.py
@@ -1,5 +1,5 @@
 import unittest
-from paddle.v2.fluid.framework import Variable, g_main_program, Program
+from paddle.v2.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_
 import paddle.v2.fluid.core as core
 import numpy as np
 
@@ -7,7 +7,7 @@ import numpy as np
 class TestVariable(unittest.TestCase):
     def test_np_dtype_convert(self):
         DT = core.DataType
-        convert = Variable._convert_np_dtype_to_dtype_
+        convert = convert_np_dtype_to_dtype_
         self.assertEqual(DT.FP32, convert(np.float32))
         self.assertEqual(DT.FP16, convert("float16"))
         self.assertEqual(DT.FP64, convert("float64"))
@@ -18,17 +18,17 @@ class TestVariable(unittest.TestCase):
         self.assertRaises(ValueError, lambda: convert("int8"))
 
     def test_var(self):
-        b = g_main_program.current_block()
+        b = default_main_program().current_block()
         w = b.create_var(
             dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
         self.assertNotEqual(str(w), "")
-        self.assertEqual(core.DataType.FP64, w.data_type)
+        self.assertEqual(core.DataType.FP64, w.dtype)
         self.assertEqual((784, 100), w.shape)
         self.assertEqual("fc.w", w.name)
         self.assertEqual(0, w.lod_level)
 
         w = b.create_var(name='fc.w')
-        self.assertEqual(core.DataType.FP64, w.data_type)
+        self.assertEqual(core.DataType.FP64, w.dtype)
         self.assertEqual((784, 100), w.shape)
         self.assertEqual("fc.w", w.name)
         self.assertEqual(0, w.lod_level)
diff --git a/python/paddle/v2/fluid/tests/test_while_op.py b/python/paddle/v2/fluid/tests/test_while_op.py
index 84b432333f950f754a97bc1a051b59c16fb22aed..033b03a4957131e1155c61e8ed2f10eefb23fda4 100644
--- a/python/paddle/v2/fluid/tests/test_while_op.py
+++ b/python/paddle/v2/fluid/tests/test_while_op.py
@@ -9,11 +9,11 @@ import numpy
 class TestWhileOp(unittest.TestCase):
     def test_simple_forward(self):
         d0 = layers.data(
-            "d0", shape=[10], append_batch_size=False, data_type='float32')
+            "d0", shape=[10], append_batch_size=False, dtype='float32')
         d1 = layers.data(
-            "d1", shape=[10], append_batch_size=False, data_type='float32')
+            "d1", shape=[10], append_batch_size=False, dtype='float32')
         d2 = layers.data(
-            "d2", shape=[10], append_batch_size=False, data_type='float32')
+            "d2", shape=[10], append_batch_size=False, dtype='float32')
         i = layers.zeros(shape=[1], dtype='int64')
         i.stop_gradient = True
         init = layers.zeros(shape=[10], dtype='float32')
@@ -55,19 +55,10 @@ class TestWhileOp(unittest.TestCase):
         for i in xrange(3):
             d.append(numpy.random.random(size=[10]).astype('float32'))
 
-        d_tensor = []
-        for item in d:
-            t = core.LoDTensor()
-            t.set(item, cpu)
-            d_tensor.append(t)
-
-        outs = map(numpy.array,
-                   exe.run(feed={
-                       'd0': d_tensor[0],
-                       'd1': d_tensor[1],
-                       'd2': d_tensor[2]
-                   },
-                           fetch_list=[sum_result]))
+        outs = exe.run(feed={'d0': d[0],
+                             'd1': d[1],
+                             'd2': d[2]},
+                       fetch_list=[sum_result])
         self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)