diff --git a/Dockerfile b/Dockerfile index 42a103240e882b2732f14619308cc00f010d20af..b92ac228a8d50da93f8c0bfe2f6af31fc784f2c7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub ARG WITH_GPU ARG WITH_AVX -ENV WOBOQ OFF ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} @@ -149,21 +148,11 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter # version util jupyter fixes this issue. -# specify sphinx version as 1.5.6 and remove -U option for [pip install -U -# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest -# version(1.7.1 for now), which causes building documentation failed. + RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ pip --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ @@ -184,9 +173,9 @@ RUN pip3.6 --no-cache-dir install pylint pytest astroid isort RUN pip3.7 --no-cache-dir install pylint pytest astroid isort RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker -RUN pip3 --no-cache-dir install coverage -RUN pip3.6 --no-cache-dir install coverage -RUN pip3.7 --no-cache-dir install coverage +RUN pip3 --no-cache-dir install coverage +RUN pip3.6 --no-cache-dir install coverage +RUN pip3.7 --no-cache-dir install coverage RUN pip --no-cache-dir install coverage COPY ./python/requirements.txt /root/ @@ -204,12 +193,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure] RUN pip --no-cache-dir install certifi urllib3[secure] -# Install woboq_codebrowser to /woboq -RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - (cd /woboq \ - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release . 
\ - make) # ar mishandles 4GB files # https://sourceware.org/bugzilla/show_bug.cgi?id=14625 diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 5a889dbc3143833ff48a972d17efc0aaf63f1810..3e0a3bf8f5ba9e1570295dc4b85297ddea7c4a14 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -110,10 +110,12 @@ function(copy_part_of_thrid_party TARGET DST) SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib) + if (WITH_CRYPTO) set(dst_dir "${DST}/third_party/install/cryptopp") copy(${TARGET} - SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib) + SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + endif() set(dst_dir "${DST}/third_party/install/xxhash") copy(${TARGET} diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 0bd6a79b55392e8bfb8f33b0a29b4cf1df0d44dc..5574a55e18c6d9806cb878dc69ec597f81da97d8 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -117,7 +117,7 @@ static void TransData(const framework::LoDTensor &src_item, TensorCopy(src_item, platform::CPUPlace(), dst_item); #endif } else { - dst_item->ShareDataWith(src_item); + TensorCopy(src_item, platform::CPUPlace(), dst_item); } } else { dst_item->clear(); diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 4b984210ed18d9f51d9485616d1c28871d936237..551d1342edeb335d1cad4782f85ae9f94f8739bd 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -113,7 +113,9 @@ message DistributedStrategy { optional bool fuse_all_reduce_ops = 18 [ default = true ]; optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ]; optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ]; - // optional bool enable_backward_optimizer_op_deps = 19 [ default = true ]; + optional bool cudnn_exhaustive_search = 21 [ default = true ]; + optional int32 conv_workspace_size_limit = 22 [ default = 4000 ]; + optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index 0ffa38a037ea431c8c860da8db14c799d1524c0d..2a85c60305bd36e78c071f5703885c23e33b403e 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -29,14 +29,20 @@ namespace framework { namespace compatible { struct OpUpdateRecord { - enum class Type { kInvalid = 0, kModifyAttr, kNewAttr }; + enum class Type { + kInvalid = 0, + kModifyAttr, + kNewAttr, + kNewInput, + kNewOutput + }; Type type_; std::string remark_; }; struct ModifyAttr : OpUpdateRecord { ModifyAttr(const std::string& name, const std::string& remark, - boost::any default_value) + const boost::any& default_value) : OpUpdateRecord({Type::kModifyAttr, remark}), name_(name), default_value_(default_value) { @@ -47,9 +53,10 @@ struct ModifyAttr : OpUpdateRecord { std::string name_; boost::any default_value_; }; + struct NewAttr : OpUpdateRecord { NewAttr(const std::string& name, const std::string& remark, - boost::any default_value) + const boost::any& default_value) : OpUpdateRecord({Type::kNewAttr, remark}), name_(name), default_value_(default_value) {} @@ -59,6 +66,22 @@ struct NewAttr : 
OpUpdateRecord { boost::any default_value_; }; +struct NewInput : OpUpdateRecord { + NewInput(const std::string& name, const std::string& remark) + : OpUpdateRecord({Type::kNewInput, remark}), name_(name) {} + + private: + std::string name_; +}; + +struct NewOutput : OpUpdateRecord { + NewOutput(const std::string& name, const std::string& remark) + : OpUpdateRecord({Type::kNewOutput, remark}), name_(name) {} + + private: + std::string name_; +}; + class OpVersionDesc { public: OpVersionDesc& ModifyAttr(const std::string& name, const std::string& remark, @@ -75,6 +98,18 @@ class OpVersionDesc { return *this; } + OpVersionDesc& NewInput(const std::string& name, const std::string& remark) { + infos_.push_back(std::shared_ptr( + new compatible::NewInput(name, remark))); + return *this; + } + + OpVersionDesc& NewOutput(const std::string& name, const std::string& remark) { + infos_.push_back(std::shared_ptr( + new compatible::NewOutput(name, remark))); + return *this; + } + private: std::vector> infos_; }; diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc index 77891dafc81b3a96af28cb480f1620543caab0b8..052bf3a4b882be749e70704f18f09a7b24551ed7 100644 --- a/paddle/fluid/framework/op_version_registry_test.cc +++ b/paddle/fluid/framework/op_version_registry_test.cc @@ -42,7 +42,14 @@ TEST(test_operator_version, test_operator_version) { "height", "In order to represent a two-dimensional rectangle, the " "parameter height is added.", - 0)); + 0)) + .AddCheckpoint( + R"ROC( + Add a input [X2] and a output [Y2] + )ROC", + framework::compatible::OpVersionDesc() + .NewInput("X2", "The second input.") + .NewOutput("Y2", "The second output.")); } } // namespace compatible } // namespace framework diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 1f2734eece578f7ec266a6f31cd46b373f010fc1..98554ed04976670c1a846cbeab69815417c0a998 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -64,10 +64,9 @@ if (NOT APPLE AND NOT WIN32) SRCS analyzer_tester.cc EXTRA_DEPS reset_tensor_array paddle_fluid_shared ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) -elseif(NOT WIN32) - # TODO: Fix this unittest failed on Windows - inference_analysis_test(test_analyzer - SRCS analyzer_tester.cc - EXTRA_DEPS reset_tensor_array paddle_inference_api - ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) +elseif(WIN32) + inference_analysis_test(test_analyzer + SRCS analyzer_tester.cc + EXTRA_DEPS reset_tensor_array paddle_inference_api + ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) endif() diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 30e8386f4c86e308372b5dd6328c7d3785a073b1..fb0ad31a3e612201de32813a65970c73b73b611b 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -54,8 +54,7 @@ if(WITH_TESTING) ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST") - elseif(NOT WIN32) - # TODO: Fix this unittest failed on Windows + elseif(WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES 
DEPENDS test_image_classification) @@ -67,8 +66,7 @@ endif() if (NOT APPLE AND NOT WIN32) cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_fluid_shared ARGS --dirname=${WORD2VEC_MODEL_DIR}) -elseif (NOT WIN32) - # TODO: Fix this unittest failed on Windows +elseif (WIN32) cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 9a3a73f6c946d76f0da1b18feb8e7d61c0bf59b6..1b79c77c69e162a6f96a1762a4949386a7dadde4 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -132,6 +132,7 @@ if(NOT APPLE AND WITH_MKLML) set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc) + set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150) else() # TODO: fix this test on MACOS and OPENBLAS, the reason is that # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS @@ -191,6 +192,7 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz") inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true) +set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150) # text_classification set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 72816f6d0317600ad6bf8ffc4ad31bd1a23d7c30..b9f979f96d4b106642795151fb8e34b025b2caef 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -32,19 +32,20 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) ${EXTERNAL_PROJECT_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${INSTALL_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && - ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME} + URL ${URL}/${FILENAME} DOWNLOAD_DIR ${INSTALL_DIR} + DOWNLOAD_NO_EXTRACT 1 DOWNLOAD_NO_PROGRESS 1 CONFIGURE_COMMAND "" - BUILD_COMMAND "" + BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR} + ${CMAKE_COMMAND} -E tar xzf ${FILENAME} UPDATE_COMMAND "" INSTALL_COMMAND "" ) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") -if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32) +if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}) inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") endif() set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu index eca8246533fea4df5a057252ce8a79aef2bfe565..7aaaa0002c5ab31af72c75e69f5a283c09633ba4 100644 --- a/paddle/fluid/operators/affine_grid_op.cu +++ b/paddle/fluid/operators/affine_grid_op.cu @@ -15,7 +15,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/affine_grid_op.h" #include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/gpu_info.h" + namespace paddle { namespace operators { @@ -84,14 +86,14 @@ __global__ void affine_grid_grad_kernel(const int count, int n, int out_h, int theta_offset = n * 6; // 2 * 3; T out_grad_x = out_grad[index * 2]; - atomicAdd(theta_grad + theta_offset, out_grad_x * h_coor); - atomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor); - atomicAdd(theta_grad + theta_offset + 2, out_grad_x); + platform::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * h_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x); T out_grad_y = out_grad[index * 2 + 1]; - atomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor); - atomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor); - atomicAdd(theta_grad + theta_offset + 5, out_grad_y); + platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y); } } diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu index 7e1e7b1e6929a6fce3a315b4e4711794bc6649b7..999f990448ca6370dadacbdaee5bf3bcadcaca0e 100644 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ b/paddle/fluid/operators/grid_sampler_op.cu @@ -31,7 +31,7 @@ static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH, int sW, int H, int W, T delta) { if (in_bounds(h, w, H, W)) { - atomicAdd(data + h * sH + w * sW, delta); + platform::CudaAtomicAdd(data + h * sH + w * sW, delta); } } diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 354e5c60a6b9ed80f0f8c44439294bfa2731a423..7749903e5f36f1d93f7e111da4587d6828d445a4 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -111,7 +111,8 @@ static void CallPythonFunc(py::object *callable, out->set_lod(py_out_tensor->lod()); out->ShareDataWith(*py_out_tensor); } catch (py::cast_error &) { - PADDLE_THROW("The %d-th output must be LoDTensor", i); + PADDLE_THROW(platform::errors::InvalidArgument( + "The %d-th output must be LoDTensor.", i)); } } } diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..57891699fd2ad73a1cccce26438528657afdf340 --- /dev/null +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -0,0 +1,515 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+#include <algorithm>
+#include <cstdio>
+#include <vector>
+#include "cub/cub.cuh"
+#include "paddle/fluid/operators/top_k_op.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/float16.h"
+
+// set cub base traits in order to handle float16
+namespace cub {
+template <>
+struct NumericTraits<paddle::platform::float16>
+    : BaseTraits<FLOATING_POINT, true, false, uint16_t,
+                 paddle::platform::float16> {};
+}  // namespace cub
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+struct SegmentOffsetIter {
+  EIGEN_DEVICE_FUNC
+  explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
+    return idx * num_cols_;
+  }
+
+  int num_cols_;
+};
+
+// Iter that maps a flat index into its column index
+struct ColumnIndexIter {
+  explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(
+      const Eigen::array<int, 1>& ix) const {
+    return ix[0] % num_cols_;
+  }
+
+  int num_cols_;
+};
+
+inline static int GetDesiredBlockDim(int dim) {
+  if (dim > 128) {
+    return 256;
+  } else if (dim > 64) {
+    return 128;
+  } else if (dim > 32) {
+    return 64;
+  } else {
+    return 32;
+  }
+}
+
+template <typename T>
+__global__ void InitIndex(T* indices, T num_rows, T num_cols) {
+  int col_id = threadIdx.x;
+  int row_id = blockIdx.x;
+
+  for (int64_t j = row_id; j < num_rows; j += gridDim.x) {
+    for (int64_t i = col_id; i < num_cols; i += blockDim.x) {
+      indices[j * num_cols + i] = i;
+    }
+  }
+}
+
+template <typename T>
+struct Pair {
+  __device__ __forceinline__ Pair() {}
+  __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {}
+
+  __device__ __forceinline__ void set(T value, int64_t id) {
+    v = value;
+    this->id = id;  // assign the member, not the shadowing parameter
+  }
+
+  __device__ __forceinline__ void operator=(const Pair<T>& in) {
+    v = in.v;
+    id = in.id;
+  }
+
+  __device__ __forceinline__ bool operator<(const T value) const {
+    return (v < value);
+  }
+
+  __device__ __forceinline__ bool operator>(const T value) const {
+    return (v > value);
+  }
+  __device__ __forceinline__ bool operator<(const Pair<T>& in) const {
+    return (v < in.v) || ((v == in.v) && (id > in.id));
+  }
+
+  __device__ __forceinline__ bool operator>(const Pair<T>& in) const {
+    return (v > in.v) || ((v == in.v) && (id < in.id));
+  }
+
+  T v;
+  int64_t id;
+};
+
+template <typename T>
+__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
+                                      int beam_size, const bool& largest) {
+  for (int k = beam_size - 2; k >= 0; k--) {
+    if (largest) {
+      if (topk[k] < p) {
+        topk[k + 1] = topk[k];
+      } else {
+        topk[k + 1] = p;
+        return;
+      }
+    } else {
+      if (topk[k] > p) {
+        topk[k + 1] = topk[k];
+      } else {
+        topk[k + 1] = p;
+        return;
+      }
+    }
+  }
+  topk[0] = p;
+}
+
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
+                                        int dim, int beam_size,
+                                        const bool& largest) {
+  while (idx < dim) {
+    if (largest) {
+      if (topk[beam_size - 1] < src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        AddTo<T>(topk, tmp, beam_size, largest);
+      }
+    } else {
+      if (topk[beam_size - 1] > src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        AddTo<T>(topk, tmp, beam_size, largest);
+      }
+    }
+    idx += BlockSize;
+  }
+}
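
Reviewer note: the per-candidate insertion that `AddTo` performs can be exercised on the host. A minimal sketch for the `largest == true` case (`HostPair` and `HostAddTo` are hypothetical names, not part of this patch):

```cpp
#include <cstdio>

// Keeps a fixed-size buffer sorted in descending order and inserts a
// candidate by shifting smaller entries one slot down, mirroring the
// device-side insertion in AddTo when `largest` is true.
struct HostPair {
  float v;
  long id;
};

void HostAddTo(HostPair topk[], HostPair p, int beam_size) {
  for (int k = beam_size - 2; k >= 0; k--) {
    if (topk[k].v < p.v) {
      topk[k + 1] = topk[k];  // shift the smaller entry down
    } else {
      topk[k + 1] = p;  // found the insertion point
      return;
    }
  }
  topk[0] = p;  // candidate is the new maximum
}

int main() {
  HostPair topk[3] = {{5.f, 0}, {3.f, 1}, {1.f, 2}};
  HostAddTo(topk, {4.f, 7}, 3);
  for (const auto& e : topk) std::printf("%.1f(id=%ld) ", e.v, e.id);
  // prints: 5.0(id=0) 4.0(id=7) 3.0(id=1)
  return 0;
}
```
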
+
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
+                                        int dim, const Pair<T>& max,
+                                        int beam_size, const bool& largest) {
+  while (idx < dim) {
+    if (largest) {
+      if (topk[beam_size - 1] < src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        if (tmp < max) {
+          AddTo<T>(topk, tmp, beam_size, largest);
+        }
+      }
+    } else {
+      if (topk[beam_size - 1] > src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        if (tmp > max) {
+          AddTo<T>(topk, tmp, beam_size, largest);
+        }
+      }
+    }
+    idx += BlockSize;
+  }
+}
+
+template <typename T, int MaxLength, int BlockSize>
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
+                                              int beam_size, const T* src,
+                                              bool* firstStep, bool* is_empty,
+                                              Pair<T>* max, int dim,
+                                              const int tid, bool largest) {
+  if (*beam > 0) {
+    int length = (*beam) < beam_size ? *beam : beam_size;
+    if (*firstStep) {
+      *firstStep = false;
+      GetTopK<T, BlockSize>(topk, src, tid, dim, length, largest);
+    } else {
+      for (int k = 0; k < MaxLength; k++) {
+        if (k < MaxLength - (*beam)) {
+          topk[k] = topk[k + *beam];
+        } else {
+          topk[k].set(-static_cast<T>(INFINITY), -1);
+        }
+      }
+      if (!(*is_empty)) {
+        GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
+                              length, largest);
+      }
+    }
+
+    *max = topk[MaxLength - 1];
+    if ((*max).v == -static_cast<T>(1)) *is_empty = true;
+    *beam = 0;
+  }
+}
+
+template <typename T, int MaxLength, int BlockSize>
+__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
+                                            Pair<T> topk[], T** topVal,
+                                            int64_t** topIds, int* beam, int* k,
+                                            const int tid, const int warp,
+                                            const bool& largest) {
+  while (true) {
+    __syncthreads();
+    if (tid < BlockSize / 2) {
+      if (largest) {
+        if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) {
+          maxid[tid] = tid + BlockSize / 2;
+        } else {
+          maxid[tid] = tid;
+        }
+      } else {
+        if (sh_topk[tid] > sh_topk[tid + BlockSize / 2]) {
+          maxid[tid] = tid + BlockSize / 2;
+        } else {
+          maxid[tid] = tid;
+        }
+      }
+    }
+    __syncthreads();
+    for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) {
+      if (tid < stride) {
+        if (largest) {
+          if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) {
+            maxid[tid] = maxid[tid + stride];
+          }
+        } else {
+          if (sh_topk[maxid[tid]] > sh_topk[maxid[tid + stride]]) {
+            maxid[tid] = maxid[tid + stride];
+          }
+        }
+      }
+      __syncthreads();
+    }
+    __syncthreads();
+
+    if (tid == 0) {
+      **topVal = sh_topk[maxid[0]].v;
+      **topIds = sh_topk[maxid[0]].id;
+      (*topVal)++;
+      (*topIds)++;
+    }
+    if (tid == maxid[0]) (*beam)++;
+    if (--(*k) == 0) break;
+    __syncthreads();
+
+    if (tid == maxid[0]) {
+      if (*beam < MaxLength) {
+        sh_topk[tid] = topk[*beam];
+      }
+    }
+    // NOTE(zcd): temporary solution
+    unsigned mask = 0u;
+    CREATE_SHFL_MASK(mask, true);
+
+    if (maxid[0] / 32 == warp) {
+      if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) ==
+          MaxLength)
+        break;
+    }
+  }
+}
+
+/**
+ * Each block computes one sample.
+ * In a block:
+ * 1. every thread gets its top MaxLength values;
+ * 2. merge them into sh_topk, block-reduce and get the max value;
+ * 3. go to the second step until one thread's topk values are exhausted;
+ * 4. go to the first step until all k top values have been emitted.
+ */ + +template +__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, + const T* src, int lds, int dim, int k, + int grid_dim, int num, bool largest = true) { + __shared__ Pair sh_topk[BlockSize]; + const int tid = threadIdx.x; + const int warp = threadIdx.x / 32; + + const int bid = blockIdx.x; + for (int i = bid; i < num; i += grid_dim) { + int top_num = k; + __shared__ int maxid[BlockSize / 2]; + T* out = output + i * output_stride; + int64_t* inds = indices + i * k; + Pair topk[MaxLength]; + int beam = MaxLength; + Pair max; + bool is_empty = false; + bool firststep = true; + + for (int j = 0; j < MaxLength; j++) { + if (largest) { + topk[j].set(-static_cast(INFINITY), -1); + } else { + topk[j].set(static_cast(INFINITY), -1); + } + } + while (top_num) { + ThreadGetTopK(topk, &beam, k, src + i * lds, + &firststep, &is_empty, &max, dim, + tid, largest); + + sh_topk[tid] = topk[0]; + BlockReduce(sh_topk, maxid, topk, &out, &inds, + &beam, &top_num, tid, warp, largest); + } + } +} + +template +__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad, + size_t rows, size_t cols, size_t k) { + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < cols; ++j) { + x_grad[i * cols + j] = 0; + } + for (size_t j = 0; j < k; ++j) { + size_t idx = indices[i * k + j]; + x_grad[i * cols + idx] = out_grad[i * k + j]; + } + } +} + +// the grad assign with the axis +template +__global__ void AssignGradWithAxis(const T* grad_out, const int64_t* indices, + T* grad_in, int pre, int post, + int raw_height, int k) { + // raw_height is the length of topk axis + for (int i = blockIdx.x; i < pre; i += gridDim.x) { + const int& base_index = i * post * k; + const int& base_grad = i * post * raw_height; + for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) { + grad_in[base_grad + j] = static_cast(0); + } + for (int j = threadIdx.x; j < k * post; j += blockDim.x) { + const int64_t idx_ij = indices[base_index + j]; + const int64_t in_ij = base_grad + (idx_ij * post) + (j % post); + grad_in[in_ij] = grad_out[idx_ij]; + } + } +} +// use the radix sort for the topk +template +bool SortTopk(const platform::CUDADeviceContext& ctx, + const framework::Tensor* input_tensor, const int64_t num_cols, + const int64_t num_rows, const int k, + framework::Tensor* out_tensor, framework::Tensor* indices_tensor, + bool largest = true) { + auto cu_stream = ctx.stream(); + + Tensor input_indices; + const std::vector dims = {num_rows, num_cols}; + auto dim = framework::make_ddim(dims); + input_indices.Resize(dim); + // input_indices.Resize(num_rows*num_cols); + input_indices.mutable_data(ctx.GetPlace()); + size_t temp_storage_bytes = -1; + + auto ComputeBlockSize = [](int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; + }; + int block_size = ComputeBlockSize(num_cols); + + unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + // actually, int num_rows < max_grid_size + unsigned int grid_size = num_rows < maxGridDimX + ? 
static_cast(num_rows) + : maxGridDimX; + // Init a index array + InitIndex<<>>( + input_indices.data(), num_rows, num_cols); + + // create iter for counting input + cub::CountingInputIterator counting_iter(0); + // segment_offset is used for move to next row + cub::TransformInputIterator> + segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); + + T* sorted_values_ptr; + int64_t* sorted_indices_ptr; + + Tensor temp_values; + Tensor temp_indices; + + const T* input = input_tensor->data(); + T* values = out_tensor->data(); + int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); + + if (k == num_cols) { + // Doing a full sort. + sorted_values_ptr = values; + sorted_indices_ptr = indices; + } else { + temp_values.Resize(dim); + temp_indices.Resize(dim); + sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); + sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); + } + + // Get temp storage buffer size, maybe can allocate a fixed buffer to save + // time. + if (largest) { + auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending( + nullptr, temp_storage_bytes, input, sorted_values_ptr, + input_indices.data(), sorted_indices_ptr, num_cols * num_rows, + num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, + cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) + << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate " + "temp_storage_bytes, status: " + << cudaGetErrorString(err); + return false; + } + } else { + auto err = cub::DeviceSegmentedRadixSort::SortPairs( + nullptr, temp_storage_bytes, input, sorted_values_ptr, + input_indices.data(), sorted_indices_ptr, num_cols * num_rows, + num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, + cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs to calculate " + "temp_storage_bytes, status: " + << cudaGetErrorString(err); + return false; + } + } + Tensor temp_storage; + temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); + + if (largest) { + auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending( + temp_storage.data(), temp_storage_bytes, input, + sorted_values_ptr, input_indices.data(), sorted_indices_ptr, + num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, + 0, sizeof(T) * 8, cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairsDescending to " + "sort input, " + "temp_storage_bytes: " + << temp_storage_bytes + << ", status: " << cudaGetErrorString(err); + return false; + } + } else { + auto err = cub::DeviceSegmentedRadixSort::SortPairs( + temp_storage.data(), temp_storage_bytes, input, + sorted_values_ptr, input_indices.data(), sorted_indices_ptr, + num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, + 0, sizeof(T) * 8, cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs to " + "sort input, " + "temp_storage_bytes: " + << temp_storage_bytes + << ", status: " << cudaGetErrorString(err); + return false; + } + } + auto& dev = *ctx.eigen_device(); + if (k < num_cols) { + // copy sliced data to output. 
+ const Eigen::DSizes slice_indices{0, 0}; + const Eigen::DSizes slice_sizes{num_rows, k}; + auto e_indices = EigenMatrix::From(*indices_tensor, dim); + auto e_tmp_indices = EigenMatrix::From(temp_indices); + + std::vector odims = {static_cast(num_rows), static_cast(k)}; + auto dim = framework::make_ddim(odims); + auto e_values = EigenMatrix::From(*out_tensor, dim); + auto e_tmp_values = EigenMatrix::From(temp_values); + + e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes); + e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes); + } + return true; +} +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 82ecc2887ba240560cf15165f21bc995f4683159..d8b2e92616091a8c822c6fd0bfdfb1148c25534d 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -12,474 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once #include +#include #include "cub/cub.cuh" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/top_k_function_cuda.h" #include "paddle/fluid/operators/top_k_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/float16.h" // set cub base traits in order to handle float16 -namespace cub { -template <> -struct NumericTraits - : BaseTraits {}; -} // namespace cub namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -struct Pair { - __device__ __forceinline__ Pair() {} - __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {} - - __device__ __forceinline__ void set(T value, int64_t id) { - v = value; - id = id; - } - - __device__ __forceinline__ void operator=(const Pair& in) { - v = in.v; - id = in.id; - } - - __device__ __forceinline__ bool operator<(const T value) const { - return (v < value); - } - - __device__ __forceinline__ bool operator<(const Pair& in) const { - return (v < in.v) || ((v == in.v) && (id > in.id)); - } - - __device__ __forceinline__ bool operator>(const Pair& in) const { - return (v > in.v) || ((v == in.v) && (id < in.id)); - } - - T v; - int64_t id; -}; - -template -__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p, - int beam_size) { - for (int k = beam_size - 2; k >= 0; k--) { - if (topk[k] < p) { - topk[k + 1] = topk[k]; - } else { - topk[k + 1] = p; - return; - } - } - topk[0] = p; -} - -template -__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p) { - for (int k = beam_size - 2; k >= 0; k--) { - if (topk[k] < p) { - topk[k + 1] = topk[k]; - } else { - topk[k + 1] = p; - return; - } - } - topk[0] = p; -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, - int dim, int beam_size) { - while (idx < dim) { - if (topk[beam_size - 1] < src[idx]) { - Pair tmp(src[idx], idx); - AddTo(topk, tmp, beam_size); - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, - int dim, const Pair& max, - int beam_size) { - while (idx < dim) { - if (topk[beam_size - 1] < src[idx]) { - Pair tmp(src[idx], idx); - if (tmp < max) { - AddTo(topk, tmp, beam_size); - } - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, - int idx, int dim, int beam_size) { - while (idx < dim) { - if 
(topk[beam_size - 1] < val[idx]) { - Pair tmp(val[idx], col[idx]); - AddTo(topk, tmp, beam_size); - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, - int idx, int dim, const Pair& max, - int beam_size) { - while (idx < dim) { - if (topk[beam_size - 1] < val[idx]) { - Pair tmp(val[idx], col[idx]); - if (tmp < max) { - AddTo(topk, tmp, beam_size); - } - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, - int beam_size, const T* src, - bool* firstStep, bool* is_empty, - Pair* max, int dim, - const int tid) { - if (*beam > 0) { - int length = (*beam) < beam_size ? *beam : beam_size; - if (*firstStep) { - *firstStep = false; - GetTopK(topk, src, tid, dim, length); - } else { - for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - (*beam)) { - topk[k] = topk[k + *beam]; - } else { - topk[k].set(-static_cast(INFINITY), -1); - } - } - if (!(*is_empty)) { - GetTopK(topk + MaxLength - *beam, src, tid, dim, *max, - length); - } - } - - *max = topk[MaxLength - 1]; - if ((*max).v == -static_cast(1)) *is_empty = true; - *beam = 0; - } -} - -template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, - int beam_size, const T* val, - int* col, bool* firstStep, - bool* is_empty, Pair* max, - int dim, const int tid) { - if (*beam > 0) { - int length = (*beam) < beam_size ? *beam : beam_size; - if (*firstStep) { - *firstStep = false; - GetTopK(topk, val, col, tid, dim, length); - } else { - for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - *beam) { - topk[k] = topk[k + *beam]; - } else { - topk[k].set(-static_cast(INFINITY), -1); - } - } - if (!(*is_empty)) { - GetTopK(topk + MaxLength - *beam, val, col, tid, dim, max, - length); - } - } - - *max = topk[MaxLength - 1]; - if ((*max).v == -1) *is_empty = true; - *beam = 0; - } -} - -template -__device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, - Pair topk[], T** topVal, - int64_t** topIds, int* beam, int* k, - const int tid, const int warp) { - while (true) { - __syncthreads(); - if (tid < BlockSize / 2) { - if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) { - maxid[tid] = tid + BlockSize / 2; - } else { - maxid[tid] = tid; - } - } - __syncthreads(); - for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) { - if (tid < stride) { - if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) { - maxid[tid] = maxid[tid + stride]; - } - } - __syncthreads(); - } - __syncthreads(); - - if (tid == 0) { - **topVal = sh_topk[maxid[0]].v; - **topIds = sh_topk[maxid[0]].id; - (*topVal)++; - (*topIds)++; - } - if (tid == maxid[0]) (*beam)++; - if (--(*k) == 0) break; - __syncthreads(); - - if (tid == maxid[0]) { - if (*beam < MaxLength) { - sh_topk[tid] = topk[*beam]; - } - } - // NOTE(zcd): temporary solution - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, true); - - if (maxid[0] / 32 == warp) { - if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) == - MaxLength) - break; - } - } -} - -/** - * Each block compute one sample. - * In a block: - * 1. every thread get top MaxLength value; - * 2. merge to sh_topk, block reduce and get max value; - * 3. go to the second setp, until one thread's topk value is null; - * 4. go to the first setp, until get the topk value. 
- */ - -template -__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, - const T* src, int lds, int dim, int k, - int grid_dim, int num) { - __shared__ Pair sh_topk[BlockSize]; - const int tid = threadIdx.x; - const int warp = threadIdx.x / 32; - - const int bid = blockIdx.x; - for (int i = bid; i < num; i += grid_dim) { - int top_num = k; - __shared__ int maxid[BlockSize / 2]; - T* out = output + i * output_stride; - int64_t* inds = indices + i * k; - Pair topk[MaxLength]; - int beam = MaxLength; - Pair max; - bool is_empty = false; - bool firststep = true; - - for (int j = 0; j < MaxLength; j++) { - topk[j].set(-static_cast(INFINITY), -1); - } - while (top_num) { - ThreadGetTopK( - topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid); - - sh_topk[tid] = topk[0]; - BlockReduce(sh_topk, maxid, topk, &out, &inds, - &beam, &top_num, tid, warp); - } - } -} - -template -__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad, - size_t rows, size_t cols, size_t k) { - for (size_t i = 0; i < rows; ++i) { - for (size_t j = 0; j < cols; ++j) { - x_grad[i * cols + j] = 0; - } - for (size_t j = 0; j < k; ++j) { - size_t idx = indices[i * k + j]; - x_grad[i * cols + idx] = out_grad[i * k + j]; - } - } -} - -inline static int GetDesiredBlockDim(int dim) { - if (dim > 128) { - return 256; - } else if (dim > 64) { - return 128; - } else if (dim > 32) { - return 64; - } else { - return 32; - } -} - -// Iter for move to next row -struct SegmentOffsetIter { - EIGEN_DEVICE_FUNC - explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { - return idx * num_cols_; - } - - int num_cols_; -}; - -// Iter using into a column -struct ColumnIndexIter { - explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()( - const Eigen::array& ix) const { - return ix[0] % num_cols_; - } - - int num_cols_; -}; - -__global__ void InitIndex(int64_t* indices, int64_t num_rows, - int64_t num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (int64_t j = row_id; j < num_rows; j += gridDim.x) { - for (int64_t i = col_id; i < num_cols; i += blockDim.x) { - indices[j * num_cols + i] = i; - } - } -} - -template -bool SortTopk(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, const int64_t num_cols, - const int64_t num_rows, const int k, - framework::Tensor* out_tensor, - framework::Tensor* indices_tensor) { - auto cu_stream = ctx.stream(); - - Tensor input_indices; - const std::vector dims = {num_rows, num_cols}; - auto dim = framework::make_ddim(dims); - input_indices.Resize(dim); - // input_indices.Resize(num_rows*num_cols); - input_indices.mutable_data(ctx.GetPlace()); - size_t temp_storage_bytes = -1; - - auto ComputeBlockSize = [](int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; - // actually, int num_rows < max_grid_size - unsigned int grid_size = num_rows < maxGridDimX - ? 
static_cast(num_rows) - : maxGridDimX; - // Init a index array - InitIndex<<>>( - input_indices.data(), num_rows, num_cols); - - // create iter for counting input - cub::CountingInputIterator counting_iter(0); - // segment_offset is used for move to next row - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - - T* sorted_values_ptr; - int64_t* sorted_indices_ptr; - - Tensor temp_values; - Tensor temp_indices; - - const T* input = input_tensor->data(); - T* values = out_tensor->data(); - int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); - - if (k == num_cols) { - // Doing a full sort. - sorted_values_ptr = values; - sorted_indices_ptr = indices; - } else { - temp_values.Resize(dim); - temp_indices.Resize(dim); - sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); - sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); - } - - // Get temp storage buffer size, maybe can allocate a fixed buffer to save - // time. - auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, input, sorted_values_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - if (err != cudaSuccess) { - LOG(ERROR) - << "TopKOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate " - "temp_storage_bytes, status: " - << cudaGetErrorString(err); - return false; - } - Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - temp_storage.data(), temp_storage_bytes, input, - sorted_values_ptr, input_indices.data(), sorted_indices_ptr, - num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, - 0, sizeof(T) * 8, cu_stream); - if (err != cudaSuccess) { - LOG(ERROR) - << "TopKOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, " - "temp_storage_bytes: " - << temp_storage_bytes << ", status: " << cudaGetErrorString(err); - return false; - } - auto& dev = *ctx.eigen_device(); - if (k < num_cols) { - // copy sliced data to output. - const Eigen::DSizes slice_indices{0, 0}; - const Eigen::DSizes slice_sizes{num_rows, k}; - auto e_indices = EigenMatrix::From(*indices_tensor, dim); - auto e_tmp_indices = EigenMatrix::From(temp_indices); - - std::vector odims = {static_cast(num_rows), static_cast(k)}; - auto dim = framework::make_ddim(odims); - auto e_values = EigenMatrix::From(*out_tensor, dim); - auto e_tmp_values = EigenMatrix::From(temp_values); - - e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes); - e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes); - } - return true; -} - #define FIXED_BLOCK_DIM_BASE(dim, ...) 
\ case (dim): { \ constexpr auto kBlockDim = (dim); \ @@ -523,7 +70,6 @@ class TopkOpCUDAKernel : public framework::OpKernel { framework::slice_ddim(inputdims, 0, inputdims.size() - 1)); const int64_t input_width = inputdims[inputdims.size() - 1]; const auto& dev_ctx = ctx.cuda_device_context(); - if ((input_width <= 1024 || k >= 128 || k == input_width)) { if (SortTopk(dev_ctx, input, input_width, input_height, k, output, indices)) { @@ -576,7 +122,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { framework::product(framework::slice_ddim(xdims, 0, xdims.size() - 1)); const size_t col = xdims[xdims.size() - 1]; const auto& dev_ctx = context.cuda_device_context(); - const int kMaxHeight = 2048; int gridx = row < kMaxHeight ? row : kMaxHeight; switch (GetDesiredBlockDim(col)) { @@ -595,7 +140,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle - REGISTER_OP_CUDA_KERNEL( top_k, paddle::operators::TopkOpCUDAKernel + +namespace paddle { +namespace operators { + +class TopkV2Op : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of TopkOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of TopkOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Indices"), + "Output(Indices) of TopkOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + const int& dim_size = input_dims.size(); + const int k = static_cast(ctx->Attrs().Get("k")); + int axis = static_cast(ctx->Attrs().Get("axis")); + PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true, + "the axis of topk" + "must be [-%d, %d), but you set axis is %d", + dim_size, dim_size, axis); + + if (axis < 0) axis += dim_size; + + PADDLE_ENFORCE_GE( + k, 1, "the attribute of k in the topk must >= 1, but received %d .", k); + PADDLE_ENFORCE_GE(input_dims.size(), 1, + "input of topk must have >= 1d shape"); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_GE( + input_dims[axis], k, + "input of topk op must have >= %d columns in axis of %d", k, axis); + } + + framework::DDim dims = input_dims; + + dims[axis] = k; + ctx->SetOutputDim("Out", dims); + ctx->SetOutputDim("Indices", dims); + ctx->ShareLoD("X", "Out"); + ctx->ShareLoD("X", "Indices"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context(), + layout_, library_); + } +}; + +class TopkV2OpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The input of Topk op"); + AddInput("K", + "(Tensor) Number of top elements to look for along " + "the last dimension (along each row for matrices).") + .AsDispensable(); + AddOutput("Out", "(Tensor) The output tensor of Topk op"); + AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); + AddComment(R"DOC( +Top K operator + +If the input is a vector (1d tensor), this operator finds the k largest +entries in the vector and outputs their values and indices as vectors. +Thus values[j] is the j-th largest entry in input, and its index is indices[j]. 
+ +For matrices, this operator computes the top k entries in each row. )DOC"); + AddAttr("k", + "(int, default 1) Number of top elements to look for along " + "the tensor).") + .SetDefault(1); + AddAttr("axis", + "the axis to sort and get the k indices, value." + "if not set, will get k value in last axis.") + .SetDefault(-1); + AddAttr("largest", + "control flag whether to return largest or smallest") + .SetDefault(true); + AddAttr("sorted", + "control flag whether to return elements in sorted order") + .SetDefault(true); + } +}; + +class TopkV2OpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), true, + platform::errors::InvalidArgument("Input(X) should be not null")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Indices"), true, + platform::errors::InvalidArgument("Input(Indices) should be not null")); + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + platform::errors::InvalidArgument( + "Grad Input(Out) should be not null")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput(framework::GradVarName("X")), true, + platform::errors::InvalidArgument("Grad Output(X) should be not null")); + + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class TopkV2GradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("top_k_v2_grad"); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetInput("X", this->Input("X")); + op->SetInput("Indices", this->Output("Indices")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, + ops::TopkV2GradOpMaker, + ops::TopkV2GradOpMaker); + +REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); + +REGISTER_OP_CPU_KERNEL(top_k_v2, + ops::TopkV2Kernel, + ops::TopkV2Kernel, + ops::TopkV2Kernel, + ops::TopkV2Kernel) + +REGISTER_OP_CPU_KERNEL( + top_k_v2_grad, ops::TopkV2GradKernel, + ops::TopkV2GradKernel, + ops::TopkV2GradKernel, + ops::TopkV2GradKernel) diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5154503292014fa43efae919f20d731060b9db57 --- /dev/null +++ b/paddle/fluid/operators/top_k_v2_op.cu @@ -0,0 +1,272 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/p_norm_op.h"
+#include "paddle/fluid/operators/top_k_function_cuda.h"
+#include "paddle/fluid/operators/top_k_v2_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+#define FIXED_BLOCK_DIM_BASE(dim, ...) \
+  case (dim): {                        \
+    constexpr auto kBlockDim = (dim);  \
+    __VA_ARGS__;                       \
+  } break
+
+#define FIXED_BLOCK_DIM(...)                \
+  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
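
Reviewer note: these macros turn the runtime block-size choice into a compile-time template parameter. A self-contained sketch of the same dispatch pattern (`LaunchTopK` is a hypothetical stand-in for the real kernel launch, not part of this patch):

```cpp
#include <cstdio>

// Each generated `case` binds kBlockDim as a compile-time constant, so the
// templated launch below is instantiated once per supported block size.
template <int BlockDim>
void LaunchTopK(int width) {
  std::printf("launch with BlockDim=%d for width=%d\n", BlockDim, width);
}

#define FIXED_BLOCK_DIM_BASE(dim, ...) \
  case (dim): {                        \
    constexpr auto kBlockDim = (dim);  \
    __VA_ARGS__;                       \
  } break

#define FIXED_BLOCK_DIM(...)                \
  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);  \
  FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)

inline int GetDesiredBlockDim(int dim) {
  if (dim > 128) return 256;
  if (dim > 64) return 128;
  if (dim > 32) return 64;
  return 32;
}

int main() {
  int width = 100;  // expect BlockDim=128
  switch (GetDesiredBlockDim(width)) {
    FIXED_BLOCK_DIM(LaunchTopK<kBlockDim>(width));
    default:
      std::printf("unsupported width\n");
  }
  return 0;
}
```
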
+
+template <typename DeviceContext, typename T>
+class TopkV2OpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    auto* indices = ctx.Output<Tensor>("Indices");
+
+    // get the attributes
+    int k = static_cast<int>(ctx.Attr<int>("k"));
+    int axis = static_cast<int>(ctx.Attr<int>("axis"));
+    const bool& sorted = static_cast<bool>(ctx.Attr<bool>("sorted"));
+    const bool& largest = static_cast<bool>(ctx.Attr<bool>("largest"));
+
+    // get the input dims
+    const auto& in_dims = input->dims();
+    // calculate the real axis
+    if (axis < 0) axis += in_dims.size();
+
+    auto* k_t = ctx.Input<Tensor>("K");
+    if (k_t) {
+      Tensor k_host;
+      framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host);
+      k = k_host.data<int>()[0];
+      framework::DDim output_dims = output->dims();
+      output_dims[axis] = k;
+      output->Resize(output_dims);
+      indices->Resize(output_dims);
+    }
+
+    const auto& out_dims = output->dims();
+
+    const T* input_data = input->data<T>();
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
+
+    if (axis == in_dims.size() - 1) {
+      // get the topk along the last axis
+      const int64_t& input_height = framework::product(
+          framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
+      const int64_t& input_width = in_dims[in_dims.size() - 1];
+      const auto& dev_ctx = ctx.cuda_device_context();
+
+      if (k > input_width) k = input_width;
+
+      if ((input_width <= 1024 || k >= 128 || k == input_width)) {
+        if (SortTopk<T>(dev_ctx, input, input_width, input_height, k, output,
+                        indices, largest)) {
+          // Succeeded, return.
+          return;
+        } else {
+          LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
+                       "default topk kernel.";
+        }
+      }
+
+      // NOTE: pass lds and dim same to input width.
+      // NOTE: old matrix implementation of stride is different to eigen.
+      const int kMaxHeight = 2048;
+      int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
+      switch (GetDesiredBlockDim(input_width)) {
+        FIXED_BLOCK_DIM(
+            KeMatrixTopK<T, 20,
+                         kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+                output_data, k, indices_data, input_data, input_width,
+                input_width, static_cast<int>(k), gridx, input_height,
+                largest));
+        default:
+          PADDLE_THROW(platform::errors::Fatal(
+              "the input data shape is invalid in the topk CUDA kernel."));
+      }
+    } else {
+      // if topk is not along the last axis, transpose the tensor and
+      // compute topk on the transposed result
+
+      // first step, prepare the trans args for the transpose
+      std::vector<int> trans;
+      for (int i = 0; i < axis; i++) {
+        trans.emplace_back(i);
+      }
+      trans.emplace_back(in_dims.size() - 1);
+      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+        trans.emplace_back(i);
+      }
+      trans.emplace_back(axis);
+
+      framework::DDim trans_dims(in_dims);
+      framework::DDim trans_out_dims(output->dims());
+      for (int i = 0; i < trans.size(); i++) {
+        trans_dims[i] = in_dims[trans[i]];
+        trans_out_dims[i] = out_dims[trans[i]];
+      }
+      // second step, transpose the input
+      Tensor trans_input;
+      trans_input.mutable_data<T>(trans_dims, ctx.GetPlace());
+      int ndims = trans.size();
+      const auto& dev_ctx = ctx.cuda_device_context();
+      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
+                                                   &trans_input, trans);
+      // third step, calculate the topk
+      // allocate the tmp cuda memory for the tmp result
+      Tensor trans_ind;
+      trans_ind.mutable_data<int64_t>(trans_out_dims, ctx.GetPlace());
+      Tensor trans_out;
+      trans_out.mutable_data<T>(trans_out_dims, ctx.GetPlace());
+
+      const int64_t input_height = framework::product(
+          framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
+      const int64_t input_width = trans_dims[trans_dims.size() - 1];
+
+      if (k > input_width) k = input_width;
+
+      if ((input_width <= 1024 || k >= 128 || k == input_width)) {
+        if (SortTopk<T>(dev_ctx, &trans_input, input_width, input_height, k,
+                        &trans_out, &trans_ind, largest)) {
+          // last step, transpose back the indices and output
+          TransCompute<platform::CUDADeviceContext, int64_t>(
+              ndims, dev_ctx, trans_ind, indices, trans);
+          TransCompute<platform::CUDADeviceContext, T>(
+              ndims, dev_ctx, trans_out, output, trans);
+          return;
+        } else {
+          LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
+                       "default topk kernel.";
+        }
+      }
+
+      const int kMaxHeight = 2048;
+      int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
+      switch (GetDesiredBlockDim(input_width)) {
+        FIXED_BLOCK_DIM(
+            KeMatrixTopK<T, 20,
+                         kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+                trans_out.data<T>(), k, trans_ind.data<int64_t>(),
+                trans_input.data<T>(), input_width, input_width,
+                static_cast<int>(k), gridx, input_height, largest));
+        default:
+          PADDLE_THROW(platform::errors::Fatal(
+              "the input data shape is invalid in the topk CUDA kernel."));
+      }
+
+      // last step, transpose back the indices and output
+      TransCompute<platform::CUDADeviceContext, int64_t>(
+          ndims, dev_ctx, trans_ind, indices, trans);
+      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, trans_out,
+                                                   output, trans);
+    }
+  }
+};
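
Reviewer note: the permutation built in the non-last-axis branch swaps the topk axis with the last dimension and keeps the remaining dimensions in order. A standalone sketch (`TopKTransposePerm` is a hypothetical helper, not part of this patch):

```cpp
#include <cstdio>
#include <vector>

// Same loop structure as in TopkV2OpCUDAKernel::Compute: move the old last
// dimension into axis' slot and the topk axis to the last position.
std::vector<int> TopKTransposePerm(int rank, int axis) {
  std::vector<int> trans;
  for (int i = 0; i < axis; i++) trans.emplace_back(i);
  trans.emplace_back(rank - 1);  // old last dim takes axis' slot
  for (int i = axis + 1; i < rank - 1; i++) trans.emplace_back(i);
  trans.emplace_back(axis);  // topk axis becomes the last dim
  return trans;
}

int main() {
  for (int v : TopKTransposePerm(4, 1)) std::printf("%d ", v);
  // prints: 0 3 2 1  -> dims (d0, d1, d2, d3) become (d0, d3, d2, d1)
  return 0;
}
```
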
+
+#undef FIXED_BLOCK_DIM_BASE
+#undef FIXED_BLOCK_DIM
+
+template <typename DeviceContext, typename T>
+class TopkV2OpGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(context.GetPlace()), true,
+        platform::errors::InvalidArgument("It must use CUDAPlace."));
+    auto* x = context.Input<Tensor>("X");
+    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* indices = context.Input<Tensor>("Indices");
+    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    int axis = context.Attr<int>("axis");
+
+    const auto& in_dims = x->dims();
+    const auto& out_dims = indices->dims();
+
+    // get the real axis and the k
+    if (axis < 0) axis += in_dims.size();
+    const int& k = out_dims[axis];
+    const int& raw_height = in_dims[axis];
+
+    // allocate the cuda memory for the x_grad
+    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
+    const T* out_grad_data = out_grad->data<T>();
+    const int64_t* indices_data = indices->data<int64_t>();
+
+    int pre, n, post;
+    GetDims(in_dims, axis, &pre, &n, &post);
+
+    // calculate the block and grid num
+    auto& dev_ctx = context.cuda_device_context();
+    auto ComputeBlockSize = [](int col) {
+      if (col > 512)
+        return 1024;
+      else if (col > 256 && col <= 512)
+        return 512;
+      else if (col > 128 && col <= 256)
+        return 256;
+      else if (col > 64 && col <= 128)
+        return 128;
+      else
+        return 64;
+    };
+    int block_size = ComputeBlockSize(post * k);
+    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+    const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
+    int grid_size = std::min(max_blocks, pre);
+
+    // launch the cuda kernel to assign the grad
+    AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
+        out_grad_data, indices_data, x_grad_data, pre, post, n, k);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(
+    top_k_v2,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          float>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          double>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          int>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          paddle::platform::float16>);
+
+REGISTER_OP_CUDA_KERNEL(
+    top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel<
+                       paddle::platform::CUDADeviceContext, float>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, double>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, int>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, int64_t>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, paddle::platform::float16>);
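
Reviewer note: for reference, a CPU sketch of the scatter the grad path intends on the (pre, n, post) view produced by `GetDims`, where n is the size of the topk axis. `AssignGradCPU` is a hypothetical illustration, not part of this patch:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Zero the input gradient, then route each top-k output gradient back to the
// position along the topk axis that `indices` selected.
void AssignGradCPU(const float* grad_out, const int64_t* indices,
                   float* grad_in, int pre, int post, int n, int k) {
  std::fill(grad_in, grad_in + pre * n * post, 0.f);
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < k * post; ++j) {
      const int64_t idx = indices[i * post * k + j];
      grad_in[i * post * n + idx * post + j % post] =
          grad_out[i * post * k + j];
    }
  }
}

int main() {
  // One row of width 4, k = 2: top-2 values sat at positions 2 and 0.
  float grad_out[2] = {0.9f, 0.5f};
  int64_t indices[2] = {2, 0};
  float grad_in[4];
  AssignGradCPU(grad_out, indices, grad_in, /*pre=*/1, /*post=*/1, /*n=*/4,
                /*k=*/2);
  for (float v : grad_in) std::printf("%.1f ", v);  // 0.5 0.0 0.9 0.0
  return 0;
}
```
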
diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a77285d123644e6ea2b9077f3338b92add42f7f0 --- /dev/null +++ b/paddle/fluid/operators/top_k_v2_op.h @@ -0,0 +1,321 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + We need topk v2 for backward compatibility: it redefines NaN as the maximum + value during comparison. Without topk v2, the inference results of models + trained with older versions of PaddlePaddle would be affected. +*/ + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/top_k_op.h" +#include "paddle/fluid/operators/transpose_op.h" + +namespace paddle { +namespace operators { + +template +static void FullTopK(Type input_height, Type input_width, int input_dim, + const framework::Tensor* input, T* t_out, Type* t_indices, + const int& k, const bool& largest, const bool& sorted) { + // when k is small, use a partial sort + bool partial_sort_flag = (k * 64) < input_width; + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + // Eigen::DSizes flat2dims(input_height, input_width); + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + if (partial_sort_flag) { + std::partial_sort( + col_vec.begin(), col_vec.begin() + k, col_vec.end(), + [&largest](const std::pair& l, const std::pair& r) { + if (largest) { + return (std::isnan(static_cast(l.first)) && + !std::isnan(static_cast(r.first))) || + (l.first > r.first); + } else { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + } + }); + } else { + // use nth_element to get the k largest or k smallest elements + if (largest) { + std::nth_element( + col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (std::isnan(static_cast(l.first)) && + !std::isnan(static_cast(r.first))) || + (l.first > r.first); + }); + // nth_element leaves the leading elements unordered, so sort them + if (sorted) { + std::sort(col_vec.begin(), col_vec.begin() + k - 1, + [&largest](const std::pair& l, + const std::pair& r) { + return (std::isnan(static_cast(l.first)) && + !std::isnan(static_cast(r.first))) || + (l.first > r.first); + }); + } + } else { + std::nth_element( + col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + // nth_element leaves the leading elements unordered, so sort them + if (sorted) { + std::sort( + col_vec.begin(), col_vec.begin() + k - 1, + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + } + } + } + for (Type j = 0; j < k; ++j) { + t_out[i * k + j] = col_vec[j].first; + t_indices[i * k + j] = col_vec[j].second; + } + } +}
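A minimal NumPy sketch (illustration only, not the operator's code path) of the ordering the comparators above implement for largest=true, where NaN is treated as the maximum value:

import numpy as np

def topk_nan_as_max(row, k):
    # treat NaN as +inf so it wins under largest=True, like the comparator
    key = np.where(np.isnan(row), np.inf, row)
    idx = np.argsort(-key, kind='stable')[:k]
    return row[idx], idx

vals, idx = topk_nan_as_max(np.array([3.0, np.nan, 7.0]), 2)
# vals -> [nan, 7.0], idx -> [1, 2]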
+ +template +static void FullTopKAssign(const Type& input_height, const Type& input_width, + const int& input_dim, const framework::Tensor* input, + const framework::Tensor* indices, T* output_data, + const int& k) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + auto e_indices = EigenVector::Flatten(*indices); + for (Type j = 0; j < k; ++j) { + output_data[i * input_width + e_indices(j)] = e_input(j); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); + for (Type j = 0; j < k; ++j) { + output_data[i * input_width + e_indices(i, j)] = e_input(i, j); + } + } + } +} + +template +class TopkV2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // Get the top k elements of each row of input tensor + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + auto* indices = context.Output("Indices"); + const auto& in_dims = input->dims(); + int k = static_cast(context.Attr("k")); + const auto& sorted = static_cast(context.Attr("sorted")); + const auto& largest = static_cast(context.Attr("largest")); + + // axis < 0, calculate the real axis + int axis = static_cast(context.Attr("axis")); + if (axis < 0) axis += in_dims.size(); + + // if the K tensor is not null, use the K tensor as k + auto* k_t = context.Input("K"); + if (k_t) { + k = k_t->data()[0]; + framework::DDim output_dims = output->dims(); + // according to axis, set the K value in the output dim + output_dims[axis] = k; + output->Resize(output_dims); + indices->Resize(output_dims); + } + + T* output_data = output->mutable_data(context.GetPlace()); + int64_t* indices_data = indices->mutable_data(context.GetPlace()); + const auto& out_dims = output->dims(); + if (axis + 1 == in_dims.size()) { + const int64_t& input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + FullTopK(input_height, input_width, in_dims.size(), input, + output_data, indices_data, k, largest, sorted); + } else { + // if the topk axis is not the last dim, transpose and then do topk + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.push_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + + // get the trans input_dims, out_dims + framework::DDim trans_dims(in_dims); + framework::DDim trans_out_dims(output->dims()); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + } + for (size_t i = 0; i < trans.size(); i++) { + trans_out_dims[i] = out_dims[trans[i]]; + } + + Tensor trans_inp; + trans_inp.mutable_data(trans_dims, context.GetPlace()); + int ndims = trans.size(); + auto& dev_context = + context.template device_context(); + + // transpose the input value + TransCompute(ndims, dev_context, *input, + &trans_inp, trans); + + const int64_t input_height = framework::product( + framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + + // Allocate temp tensors to save the topk
values and indices + Tensor tmp_out; + T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); + Tensor tmp_indices; + auto* t_ind = + tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); + + // get the TopK value + FullTopK(input_height, input_width, in_dims.size(), + &trans_inp, t_out, t_ind, k, largest, sorted); + // transpose back + TransCompute( + ndims, dev_context, tmp_indices, indices, trans); + TransCompute(ndims, dev_context, tmp_out, + output, trans); + } + } +}; + +template +class TopkV2GradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out_grad = context.Input(framework::GradVarName("Out")); + auto* indices = context.Input("Indices"); + auto* x_grad = context.Output(framework::GradVarName("X")); + int axis = static_cast(context.Attr("axis")); + + const auto& in_dims = x->dims(); + const auto& out_dims = indices->dims(); + + // axis < 0, get the real axis + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + const size_t& k = out_dims[axis]; + + T* x_grad_data = x_grad->mutable_data(context.GetPlace()); + if (axis + 1 == in_dims.size()) { + // allocate the memory for the input_grad + + // assign the out_grad to input_grad directly + const int64_t input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + + // init the output grad with 0, because some input elements have no grad + memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); + // Assign the output_grad to input_grad + FullTopKAssign(input_height, input_width, in_dims.size(), out_grad, + indices, x_grad_data, k); + } else { + // cannot assign the grad to input_grad directly, must transpose first + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + framework::DDim trans_dims(out_dims); + framework::DDim trans_in_dims(in_dims); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = out_dims[trans[i]]; + trans_in_dims[i] = in_dims[trans[i]]; + } + // transpose the out_grad, indices + Tensor trans_dO; + trans_dO.mutable_data(trans_dims, context.GetPlace()); + Tensor trans_ind; + trans_ind.mutable_data(trans_dims, context.GetPlace()); + int ndims = trans.size(); + auto& dev_context = + context.template device_context(); + + // Do transpose + TransCompute(ndims, dev_context, *out_grad, + &trans_dO, trans); + TransCompute( + ndims, dev_context, *indices, &trans_ind, trans); + const int64_t input_height = framework::product( + framework::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); + const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; + + // Assign the out_grad to the transposed input_grad + Tensor tmp_out; + T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); + memset(t_out, 0, x_grad->numel() * sizeof(T)); + + FullTopKAssign(input_height, input_width, in_dims.size(), + &trans_dO, &trans_ind, t_out, k); + + // Transpose back + TransCompute(ndims, dev_context, tmp_out, + x_grad, trans); + } + } +}; + +} // namespace operators +} // namespace paddle
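As a rough NumPy model of what AssignGradWithAxis and FullTopKAssign compute (illustrative only; np.put_along_axis is this sketch's stand-in, not the kernel's API):

import numpy as np

def topk_grad(x_shape, out_grad, indices, axis):
    # positions not selected by top-k receive zero gradient
    x_grad = np.zeros(x_shape, dtype=out_grad.dtype)
    np.put_along_axis(x_grad, indices, out_grad, axis=axis)
    return x_grad

g = topk_grad((2, 4), np.ones((2, 2), dtype='float32'),
              np.array([[3, 2], [1, 3]]), axis=1)
# g -> [[0., 0., 1., 1.], [0., 1., 0., 1.]]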
diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc index c141033b2b3e6b9fdeac88610dd1362ba8f98428..1aea96a15eb090ccd1a508641e4c6c0a8dcf7fb9 100644 --- a/paddle/fluid/operators/unique_op.cc +++ b/paddle/fluid/operators/unique_op.cc @@ -24,17 +24,63 @@ class UniqueOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "unique"); - OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); - auto in_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - in_dims.size(), 1, - platform::errors::InvalidArgument("The Input(X) should be 1-D Tensor, " "But now the dims of Input(X) is %d.", - in_dims.size())); + if (!ctx->Attrs().Get("is_sorted")) { + OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); + PADDLE_ENFORCE_EQ(in_dims.size(), 1, + platform::errors::InvalidArgument( + "The Input(X) should be a 1-D Tensor, " + "but now the dims of Input(X) is %d.", + in_dims.size())); + + ctx->SetOutputDim("Out", {-1}); + ctx->SetOutputDim("Index", in_dims); + return; + } + + bool return_index = ctx->Attrs().Get("return_index"); + bool return_inverse = ctx->Attrs().Get("return_inverse"); + bool return_counts = ctx->Attrs().Get("return_counts"); + auto axis_vec = ctx->Attrs().Get>("axis"); + + if (return_index) { + OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "unique"); + } + if (return_inverse) { + OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); + } + if (return_counts) { + OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", "unique"); + } - ctx->SetOutputDim("Out", {-1}); - ctx->SetOutputDim("Index", in_dims); + if (axis_vec.empty()) { + ctx->SetOutputDim("Out", {-1}); + if (return_inverse) { + ctx->SetOutputDim("Index", {framework::product(in_dims)}); + } + } else { + int axis = axis_vec[0]; + if (axis < 0) { + axis += in_dims.size(); + } + PADDLE_ENFORCE_LT( + axis, in_dims.size(), + platform::errors::InvalidArgument("The axis(%d) should be less than " "the dimension size(%d) of x.", + axis, in_dims.size())); + auto out_dims = in_dims; + out_dims[axis] = -1; + ctx->SetOutputDim("Out", out_dims); + if (return_inverse) { + ctx->SetOutputDim("Index", {in_dims[axis]}); + } + } + if (return_index) { + ctx->SetOutputDim("Indices", {-1}); + } + if (return_counts) { + ctx->SetOutputDim("Counts", {-1}); + } } protected:
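To make the shape logic above concrete, a usage sketch of the Python wrapper these outputs feed (illustrative only, assuming the paddle.unique API of this release):

import paddle
import numpy as np

paddle.disable_static()
x = paddle.to_tensor(np.array([2, 3, 3, 1, 5, 3], dtype='int64'))
out, index, inverse, counts = paddle.unique(
    x, return_index=True, return_inverse=True, return_counts=True)
# out     -> [1, 2, 3, 5]        sorted unique values ("Out")
# index   -> [3, 0, 1, 4]        first occurrence of each value ("Indices")
# inverse -> [1, 2, 2, 0, 3, 2]  map from input back to out ("Index")
# counts  -> [1, 1, 3, 1]        multiplicity of each value ("Counts")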
@@ -49,14 +95,47 @@ class UniqueOp : public framework::OperatorWithKernel { class UniqueOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "Input tensor. It should be a 1-D tensor."); + AddInput("X", + "Input tensor. It should be a 1-D tensor when Attr(is_sorted)" + " is false or an N-D tensor when Attr(is_sorted) is true."); AddAttr("dtype", "data type for output index"); AddOutput("Out", "A unique subsequence for input tensor."); AddOutput("Index", - "An index tensor pointing to unique subsequence, which has " - "identical shape with input tensor and int64 dtype."); + "Equivalent to inverse in numpy.unique, " + "the indices for where elements in the original input ended up " + "in the returned unique tensor."); + AddOutput( + "Indices", + "The indices of the input tensor that result in the unique tensor.") + .AsDispensable(); + AddOutput("Counts", "The counts for each unique element.").AsDispensable(); + AddAttr("return_index", + "If True, also return the indices of the input" + " tensor that result in the unique Tensor.") + .SetDefault(false); + AddAttr( + "return_inverse", + "If True, also return the indices for where elements" + " in the original input ended up in the returned unique tensor.") + .SetDefault(false); + AddAttr("return_counts", + "If True, also return the counts for each unique element.") + .SetDefault(false); + AddAttr>( + "axis", + "The axis to apply unique. If None, the input will be flattened.") + .SetDefault({}); + AddAttr("is_sorted", + "If True, the unique elements of X are in ascending order." + " Otherwise, the unique elements are not sorted.") + .SetDefault(false); AddComment(R"DOC( - Return a unique subsequence for 1-D input tensor, and an index tensor pointing to this unique subsequence + 1. Return a unique subsequence for a 1-D input tensor, and an index tensor + pointing to this unique subsequence, when Attr(is_sorted) is false. This + means paddle.unique is called. + + 2. Return the unique elements of X in ascending order when Attr(is_sorted) + is true. This means fluid.layers.unique is called. )DOC"); } }; @@ -65,6 +144,8 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker); -REGISTER_OP_CPU_KERNEL(unique, ops::UniqueKernel, - ops::UniqueKernel, ops::UniqueKernel, - ops::UniqueKernel); +REGISTER_OP_CPU_KERNEL( + unique, ops::UniqueKernel, + ops::UniqueKernel, + ops::UniqueKernel, + ops::UniqueKernel); diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index cdfd797cbfdf87a42ce0834eea9467010a058431..dc8b2ac5555126d8cf2bb92d2f506b1bf358e680 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -13,12 +13,17 @@ See the License for the specific language governing permissions and limitations under the License.
*/ #pragma once +#include #include +#include +#include #include #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/transpose_op.h" namespace paddle { namespace operators { @@ -104,17 +109,243 @@ struct UniqueOpFunctor { } }; +static std::vector Unbind(const framework::Tensor& in) { + int64_t size = in.dims()[0]; + std::vector tensors(size); + for (int64_t i = 0; i < size; ++i) { + tensors[i] = in.Slice(i, i + 1); + } + return tensors; +} + +template +static bool Equal(const framework::Tensor& a, const framework::Tensor& b) { + if (a.numel() != b.numel()) { + return false; + } + for (int64_t i = 0; i < a.numel(); ++i) { + if (a.data()[i] != b.data()[i]) { + return false; + } + } + return true; +} + template +static void UniqueFlattendTensor(const framework::ExecutionContext& context, + const framework::Tensor& in, + framework::Tensor* out, bool return_index, + bool return_inverse, bool return_counts) { + const T* in_data = in.data(); + std::set unique(in_data, in_data + in.numel()); + out->Resize(framework::make_ddim({static_cast(unique.size())})); + auto out_data = out->mutable_data(context.GetPlace()); + std::copy(unique.begin(), unique.end(), out_data); + + if (return_index) { + auto* indices = context.Output("Indices"); + indices->Resize(framework::make_ddim({out->numel()})); + auto indices_data = indices->mutable_data(context.GetPlace()); + std::unordered_map indices_map; + indices_map.reserve(out->numel()); + for (int64_t i = 0; i < in.numel(); ++i) { + if (indices_map.find(in_data[i]) != indices_map.end()) continue; + indices_map[in_data[i]] = i; + } + for (int64_t i = 0; i < out->numel(); ++i) { + indices_data[i] = indices_map[out_data[i]]; + } + } + + if (return_inverse) { + auto* inverse = context.Output("Index"); + inverse->Resize(framework::make_ddim({in.numel()})); + auto inverse_data = inverse->mutable_data(context.GetPlace()); + std::unordered_map inverse_map; + inverse_map.reserve(out->numel()); + for (int64_t i = 0; i < out->numel(); ++i) { + inverse_map[out_data[i]] = i; + } + for (int64_t i = 0; i < in.numel(); ++i) { + inverse_data[i] = inverse_map[in_data[i]]; + } + } + + if (return_counts) { + auto* count = context.Output("Counts"); + count->Resize(framework::make_ddim({out->numel()})); + auto count_data = count->mutable_data(context.GetPlace()); + std::unordered_map counts_map; + counts_map.reserve(out->numel()); + for (int64_t i = 0; i < out->numel(); ++i) { + counts_map[out_data[i]] = 0; + } + for (int64_t i = 0; i < in.numel(); i++) { + counts_map[in_data[i]] += 1; + } + for (int64_t i = 0; i < out->numel(); i++) { + count_data[i] = counts_map[out_data[i]]; + } + } +} + +template +static ForwardIt UniqueDimImpl(const framework::ExecutionContext& context, + ForwardIt first, ForwardIt last, + const std::vector& sorted_indices_vec, + std::vector* inverse_vec, + std::vector* counts_vec, + std::vector* indices_vec) { + if (first == last) { + return last; + } + + (*inverse_vec)[sorted_indices_vec[0]] = 0; + (*counts_vec)[0] = 1; + (*indices_vec)[0] = sorted_indices_vec[0]; + + ForwardIt begin = first; + ForwardIt result = first; + + while (++first != last) { + int64_t idx_first = std::distance(begin, first); + int64_t idx_result = std::distance(begin, result); + if (!Equal(*result, *first)) { + if (++result != first) { + *result = std::move(*first); + } + idx_result += 1; + (*indices_vec)[idx_result] = 
sorted_indices_vec[idx_first]; + } + (*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result; + (*counts_vec)[idx_result] += 1; + } + return ++result; +} + +template +static void UniqueDim(const framework::ExecutionContext& context, + const framework::Tensor& in, framework::Tensor* out, + bool return_index, bool return_inverse, + bool return_counts, int axis) { + // transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] + std::vector permute(in.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis] = 0; + permute[0] = axis; + std::vector in_trans_dims_vec(framework::vectorize(in.dims())); + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; + framework::Tensor in_trans; + framework::DDim in_trans_dims = framework::make_ddim(in_trans_dims_vec); + in_trans.Resize(in_trans_dims); + in_trans.mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + TransCompute(in.dims().size(), dev_ctx, in, &in_trans, + permute); + // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] + framework::DDim in_trans_flat_dims = + framework::flatten_to_2d(in_trans_dims, 1); + in_trans.Resize(in_trans_flat_dims); + + // sort indices + std::vector sorted_indices_vec(in_trans.dims()[0]); + std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0); + int64_t col = in_trans.dims()[1]; + const T* in_trans_data = in_trans.data(); + std::sort(sorted_indices_vec.begin(), sorted_indices_vec.end(), + [&](int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < col; ++i) { + T lhs = in_trans_data[i + a * col]; + T rhs = in_trans_data[i + b * col]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + }); + + // sort tensor according to indices + framework::Tensor input_sorted; + input_sorted.Resize(in_trans_dims); + input_sorted.mutable_data(context.GetPlace()); + T* input_sorted_data = input_sorted.data(); + for (size_t i = 0; i < sorted_indices_vec.size(); ++i) { + memcpy(input_sorted_data + i * col, + in_trans_data + sorted_indices_vec[i] * col, col * sizeof(T)); + } + + std::vector input_unbind = Unbind(input_sorted); + std::vector inverse_vec(sorted_indices_vec.size(), 0); + std::vector counts_vec(sorted_indices_vec.size(), 0); + std::vector indices_vec(sorted_indices_vec.size(), 0); + auto last = UniqueDimImpl::iterator, T>( + context, input_unbind.begin(), input_unbind.end(), sorted_indices_vec, + &inverse_vec, &counts_vec, &indices_vec); + input_unbind.erase(last, input_unbind.end()); + counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end()); + indices_vec.erase(indices_vec.begin() + input_unbind.size(), + indices_vec.end()); + + math::ConcatFunctor concat_functor; + framework::Tensor out_trans; + std::vector out_trans_dims_vec = in_trans_dims_vec; + out_trans_dims_vec[0] = input_unbind.size(); + out_trans.Resize(framework::make_ddim(out_trans_dims_vec)); + out_trans.mutable_data(context.GetPlace()); + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); + out->Resize(framework::make_ddim(out_trans_dims_vec)); + out->mutable_data(context.GetPlace()); + concat_functor(dev_ctx, input_unbind, 0, &out_trans); + TransCompute(out_trans.dims().size(), dev_ctx, out_trans, + out, permute); + + if (return_inverse) { + auto* inverse = context.Output("Index"); + framework::TensorFromVector(inverse_vec, context.device_context(), inverse); + } + + if (return_counts) { + auto* count = context.Output("Counts"); + 
framework::TensorFromVector(counts_vec, context.device_context(), count); + } + + if (return_index) { + auto* indices = context.Output("Indices"); + framework::TensorFromVector(indices_vec, context.device_context(), indices); + } +} + +template class UniqueKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto data_type = static_cast( - context.Attr("dtype")); auto* x = context.Input("X"); auto* out = context.Output("Out"); - auto* index = context.Output("Index"); + if (!context.Attr("is_sorted")) { + auto data_type = static_cast( + context.Attr("dtype")); + auto* index = context.Output("Index"); + + framework::VisitDataType(data_type, UniqueOpFunctor(out, index, x)); + return; + } - framework::VisitDataType(data_type, UniqueOpFunctor(out, index, x)); + std::vector axis_vec = context.Attr>("axis"); + bool return_index = context.Attr("return_index"); + bool return_inverse = context.Attr("return_inverse"); + bool return_counts = context.Attr("return_counts"); + + if (axis_vec.empty()) { + UniqueFlattendTensor(context, *x, out, return_index, return_inverse, + return_counts); + } else { + int axis = axis_vec[0]; + UniqueDim(context, *x, out, return_index, + return_inverse, return_counts, axis); + } } }; diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index ec458ee7957c6a496ad4bd5579fe0b3c8a72069d..d7126b958650acfdf10a1b75ec5e8ee51643c008 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -62,6 +62,7 @@ std::map> op_outs_map = { {"sync_batch_norm", {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", "ReserveSpace"}}, + {"unique", {"Out", "Index", "Indices", "Counts"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 0b4e4502bb816c4ca293e54d5f78e8e504df2b48..4377a8c2cef5aab7a200955cd25830d448014817 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include #include #include +#include #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -564,9 +565,9 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, if (!is_gpu_tensor && !is_xpu_tensor) { if (!need_deep_copy) { - return py::array(py::buffer_info( - const_cast(tensor_buf_ptr), sizeof_dtype, py_dtype_str, - static_cast(tensor.dims().size()), py_dims, py_strides)); + auto base = py::cast(std::move(tensor)); + return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides, + const_cast(tensor_buf_ptr), base); } else { py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt index 04870f87c40dd305304579a454cb618bf1446e39..1f88eb2109aa23b6b60104451908b0a70c41c898 100644 --- a/paddle/fluid/train/CMakeLists.txt +++ b/paddle/fluid/train/CMakeLists.txt @@ -29,6 +29,8 @@ function(train_test TARGET_NAME) PROPERTIES DEPENDS test_${TARGET_NAME}) set_tests_properties(test_train_${TARGET_NAME}${arg} PROPERTIES LABELS "RUN_TYPE=DIST") + set_tests_properties(test_train_${TARGET_NAME}${arg} + PROPERTIES TIMEOUT 150) endforeach() endfunction(train_test) diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md index 39db5a601d3d46c106a574870f02434bd4bd5cd1..d7a86b653bec44c260a845d454c771ec4440993b 100644 --- a/paddle/scripts/README.md +++ b/paddle/scripts/README.md @@ -70,7 +70,6 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" | `WITH_STYLE_CHECK` | ON | Check the code style when building. | | `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu | | `RUN_TEST` | OFF | Run unit test immediently after the build. | -| `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` | ## Docker Images @@ -155,21 +154,6 @@ docker push kubectl ... ``` -### Reading source code with woboq codebrowser - -For developers who are interested in the C++ source code, you can build C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser). - -- The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host: - -```bash -./paddle/scripts/paddle_docker_build.sh html -``` - -- You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run: - -``` -docker run -v $HOME/woboq_out:/usr/share/nginx/html -d -p 8080:80 nginx -``` ## More Options diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 962ee53a1069b8cd7863a0f1dca616c939eb237d..57defebd7575b41c031957aa9c2f848861a006ac 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -529,13 +529,16 @@ EOF pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi ut_startTime_s=`date +%s` - ctest --output-on-failure -j $2 + ctest --output-on-failure -j $2;mactest_error=$? 
ut_endTime_s=`date +%s` echo "Mac testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s" paddle version # Recovery proxy to avoid failure in later steps export http_proxy=$my_proxy export https_proxy=$my_proxy + if [ "$mactest_error" != 0 ];then + exit 8; + fi fi } @@ -1105,22 +1108,6 @@ EOF esac } -function gen_html() { - cat < 0: + self.fc = Sequential( + Linear(400, 120), + Linear(120, 84), + Linear( + 84, 10, act=classifier_activation)) + + @declarative + def forward(self, inputs): + x = self.features(inputs) + + if self.num_classes > 0: + x = fluid.layers.flatten(x, 1) + x = self.fc(x) + return x + + class MnistDataset(MNIST): def __init__(self, mode, return_label=True, sample_num=None): super(MnistDataset, self).__init__(mode=mode) @@ -335,7 +368,6 @@ class TestModelFunction(unittest.TestCase): model = Model(net, inputs, labels) model.prepare(optim2, loss=CrossEntropyLoss(reduction="sum")) loss, = model.train_batch([data], [label]) - np.testing.assert_allclose(loss.flatten(), ref.flatten()) fluid.disable_dygraph() if dynamic else None @@ -445,33 +477,38 @@ class TestModelFunction(unittest.TestCase): fluid.disable_dygraph() if dynamic else None def test_export_deploy_model(self): - net = LeNet() - inputs = [Input([-1, 1, 28, 28], 'float32', 'image')] - model = Model(net, inputs) - model.prepare() - save_dir = tempfile.mkdtemp() - if not os.path.exists(save_dir): - os.makedirs(save_dir) - - tensor_img = np.array( - np.random.random((1, 1, 28, 28)), dtype=np.float32) - ori_results = model.test_batch(tensor_img) - - model.save_inference_model(save_dir) - - place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) - exe = fluid.Executor(place) - [inference_program, feed_target_names, fetch_targets] = ( - fluid.io.load_inference_model( - dirname=save_dir, executor=exe)) - - results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_img}, - fetch_list=fetch_targets) + for dynamic in [True, False]: + fluid.enable_dygraph() if dynamic else None + # paddle.disable_static() if dynamic else None + prog_translator = ProgramTranslator() + prog_translator.enable(False) if not dynamic else None + net = LeNetDeclarative() + inputs = [Input([None, 1, 28, 28], 'float32', 'x')] + model = Model(net, inputs) + model.prepare() + save_dir = tempfile.mkdtemp() + if not os.path.exists(save_dir): + os.makedirs(save_dir) + tensor_img = np.array( + np.random.random((1, 1, 28, 28)), dtype=np.float32) + ori_results = model.test_batch(tensor_img) + model.save(save_dir, training=False) + fluid.disable_dygraph() if dynamic else None - np.testing.assert_allclose(results, ori_results, rtol=1e-6) - shutil.rmtree(save_dir) + place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + new_scope = fluid.Scope() + with fluid.scope_guard(new_scope): + exe = fluid.Executor(place) + [inference_program, feed_target_names, fetch_targets] = ( + fluid.io.load_inference_model( + dirname=save_dir, executor=exe)) + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + np.testing.assert_allclose( + results, ori_results, rtol=1e-5, atol=1e-7) + shutil.rmtree(save_dir) class TestRaiseError(unittest.TestCase): diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index dd5d0d269a6cb6377b11b6c98e86eef4ee0f8b57..07b3f0d284dcd28d4967131ab85bb2ca3cd1d6da 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -128,6 +128,12 @@ from .layer.norm import GroupNorm 
#DEFINE_ALIAS from .layer.norm import LayerNorm #DEFINE_ALIAS from .layer.norm import SpectralNorm #DEFINE_ALIAS from .layer.norm import InstanceNorm #DEFINE_ALIAS +from .layer.norm import InstanceNorm1d #DEFINE_ALIAS +from .layer.norm import InstanceNorm2d #DEFINE_ALIAS +from .layer.norm import InstanceNorm3d #DEFINE_ALIAS +from .layer.norm import BatchNorm1d #DEFINE_ALIAS +from .layer.norm import BatchNorm2d #DEFINE_ALIAS +from .layer.norm import BatchNorm3d #DEFINE_ALIAS # from .layer.rnn import RNNCell #DEFINE_ALIAS # from .layer.rnn import GRUCell #DEFINE_ALIAS # from .layer.rnn import LSTMCell #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index afc1614732d06dcef4ca0e1e75cd93e28d6a2d3d..97a4d5432bdc24912a851741328516e9269a64c2 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -160,12 +160,12 @@ from .loss import square_error_cost #DEFINE_ALIAS from .loss import ssd_loss #DEFINE_ALIAS from .loss import teacher_student_sigmoid_loss #DEFINE_ALIAS from .loss import ctc_loss #DEFINE_ALIAS -# from .norm import batch_norm #DEFINE_ALIAS # from .norm import data_norm #DEFINE_ALIAS # from .norm import group_norm #DEFINE_ALIAS -# from .norm import instance_norm #DEFINE_ALIAS from .norm import l2_normalize #DEFINE_ALIAS -# from .norm import layer_norm #DEFINE_ALIAS +from .norm import batch_norm #DEFINE_ALIAS +from .norm import instance_norm #DEFINE_ALIAS +from .norm import layer_norm #DEFINE_ALIAS from .norm import lrn #DEFINE_ALIAS from .norm import normalize #DEFINE_ALIAS # from .norm import spectral_norm #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 0b007041b4ab336ae355f5d338a0d7dca9b5380e..13e86e5712a1cd5c014517e37d3803ca24cfb6fb 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -18,16 +18,19 @@ import paddle.fluid as fluid from ...fluid.data_feeder import check_variable_and_dtype, check_type from ...fluid.layer_helper import LayerHelper from ...fluid.framework import in_dygraph_mode, core +from ...framework import create_parameter from ...fluid.layers import l2_normalize #DEFINE_ALIAS from ...fluid.layers import lrn #DEFINE_ALIAS +from ...fluid.initializer import Constant +from ...fluid.param_attr import ParamAttr +from ...fluid import core, dygraph_utils __all__ = [ - # 'batch_norm', + 'batch_norm', # 'data_norm', - # 'group_norm', - # 'instance_norm', + 'instance_norm', 'l2_normalize', - # 'layer_norm', + 'layer_norm', 'lrn', 'normalize', # 'spectral_norm' @@ -110,3 +113,286 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): eps = out.block.create_var(dtype=out.dtype) paddle.fill_constant([1], out.dtype, epsilon, out=eps) return paddle.elementwise_div(x, paddle.maximum(out, eps), name=name) + + +def batch_norm(x, + running_mean, + running_var, + weight, + bias, + training=False, + momentum=0.9, + epsilon=1e-05, + data_format="NCHW", + name=None): + """ + Applies Batch Normalization as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. + + nn.functional.batch_norm is used by nn.BatchNorm1d, nn.BatchNorm2d and nn.BatchNorm3d. Please use those APIs for BatchNorm. + + Parameters: + x(Tensor): input value. Its data type should be float32 or float64. + running_mean(Tensor): running mean. + running_var(Tensor): running variance. + weight(Tensor): The weight tensor of batch_norm, cannot be None.
+ bias(Tensor): The bias tensor of batch_norm, cannot be None. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + training(bool, optional): True means train mode, which computes statistics from the batch data and updates the global (running) mean and variance. False means inference mode, which normalizes with the global mean and variance accumulated during training. Default: False. + data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW" or "NCDHW". Default: "NCHW". + name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same shape and data type as x, the normalized result. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x = np.random.random(size=(2, 1, 2, 3)).astype('float32') + running_mean = np.random.random(size=1).astype('float32') + running_variance = np.random.random(size=1).astype('float32') + weight_data = np.random.random(size=1).astype('float32') + bias_data = np.random.random(size=1).astype('float32') + x = paddle.to_tensor(x) + rm = paddle.to_tensor(running_mean) + rv = paddle.to_tensor(running_variance) + w = paddle.to_tensor(weight_data) + b = paddle.to_tensor(bias_data) + batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv, w, b) + print(batch_norm_out) + """ + + assert len(x.shape) >= 2, "input dim must be larger than 1" + + # not training means use_global_stats; see nn._BatchNormBase for details + use_global_stats = not training + # input and output must share the memory + mean_out = running_mean + variance_out = running_var + + if in_dygraph_mode(): + # dygraph mode requires the attrs as a flat tuple + attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout", + data_format, "use_mkldnn", False, "fuse_with_relu", False, + "use_global_stats", use_global_stats) + batch_norm_out, _, _, _, _, _ = core.ops.batch_norm( + x, weight, bias, running_mean, running_var, mean_out, variance_out, + *attrs) + + return dygraph_utils._append_activation_in_dygraph( + batch_norm_out, act=None) + + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + 'BatchNorm') + + # static mode requires the attrs as a dict + attrs = { + "momentum": momentum, + "epsilon": epsilon, + "data_layout": data_format, + "use_mkldnn": False, + "fuse_with_relu": False, + "use_global_stats": use_global_stats, + } + + inputs = { + "X": [x], + "Scale": [weight], + "Bias": [bias], + "Mean": [running_mean], + "Variance": [running_var] + } + + helper = LayerHelper('batch_norm', **locals()) + + dtype = x.dtype if x.dtype != 'float16' else 'float32' + saved_mean = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + saved_variance = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + batch_norm_out = helper.create_variable_for_type_inference(dtype) + + outputs = { + "Y": [batch_norm_out], + "MeanOut": [running_mean], + "VarianceOut": [running_var], + "SavedMean": [saved_mean], + "SavedVariance": [saved_variance] + } + + helper.append_op( + type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) + + return helper.append_activation(batch_norm_out)
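Because mean_out and variance_out alias running_mean and running_var, calling the functional op with training=True updates the running statistics in place. A small sketch, illustrative only and assuming the signature added above:

import paddle
import numpy as np

paddle.disable_static()
x = paddle.to_tensor(np.random.random((4, 3, 8, 8)).astype('float32'))
rm = paddle.to_tensor(np.zeros(3, dtype='float32'))
rv = paddle.to_tensor(np.ones(3, dtype='float32'))
w = paddle.to_tensor(np.ones(3, dtype='float32'))
b = paddle.to_tensor(np.zeros(3, dtype='float32'))

before = rm.numpy().copy()
y = paddle.nn.functional.batch_norm(x, rm, rv, w, b, training=True)
# new_rm == 0.9 * old_rm + 0.1 * batch_mean, updated without reassignment
assert not np.allclose(before, rm.numpy())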
+ + +def layer_norm(x, + normalized_shape, + weight=None, + bias=None, + epsilon=1e-05, + name=None): + """ + See more details in paddle.nn.LayerNorm. + + Parameters: + x(Tensor): Input Tensor. Its data type should be float32 or float64. + normalized_shape(int|list|tuple): Input shape from an expected input of + size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`. + If it is a single integer, this module will normalize over the last dimension + which is expected to be of that specific size. + epsilon(float, optional): The small value added to the variance to prevent + division by zero. Default: 1e-05. + weight(Tensor, optional): The weight tensor of layer_norm. Default: None. + bias(Tensor, optional): The bias tensor of layer_norm. Default: None. + name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same shape as x, the normalized result. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + layer_norm_out = paddle.nn.functional.layer_norm(x, x.shape[1:]) + + print(layer_norm_out.numpy()) + """ + input_shape = list(x.shape) + input_ndim = len(input_shape) + normalized_ndim = len(normalized_shape) + begin_norm_axis = input_ndim - normalized_ndim + if input_ndim < normalized_ndim or input_shape[begin_norm_axis:] != normalized_shape: + str_normalized_shape = str(normalized_shape) + raise ValueError('Given normalized_shape is ' + str_normalized_shape + + ', expected input with shape [*, ' + + str_normalized_shape[1:] + ', but got input shape ' + str(input_shape)) + + if in_dygraph_mode(): + pre_act, _, _ = core.ops.layer_norm(x, weight, bias, 'epsilon', epsilon, + 'begin_norm_axis', begin_norm_axis) + return dygraph_utils._append_activation_in_dygraph(pre_act, act=None) + + check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'LayerNorm') + + inputs = dict() + inputs['X'] = [x] + if weight: + inputs['Scale'] = [weight] + if bias: + inputs['Bias'] = [bias] + attrs = {"epsilon": epsilon, "begin_norm_axis": begin_norm_axis} + + # create output + helper = LayerHelper('layer_norm', **locals()) + mean_out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + variance_out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + layer_norm_out = helper.create_variable_for_type_inference(x.dtype) + + helper.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": layer_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": epsilon, + "begin_norm_axis": begin_norm_axis}) + + return helper.append_activation(layer_norm_out)
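For reference, the computation layer_norm performs when weight and bias are None, written out over the trailing normalized_shape dims as a NumPy sketch (illustration only):

import numpy as np

# x of shape (2, 3, 4) with normalized_shape=(3, 4) gives begin_norm_axis=1
x = np.random.random((2, 3, 4)).astype('float32')
flat = x.reshape(2, -1)
mu = flat.mean(axis=1, keepdims=True)
var = flat.var(axis=1, keepdims=True)
ref = ((flat - mu) / np.sqrt(var + 1e-5)).reshape(x.shape)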
+ + +def instance_norm(x, + running_mean=None, + running_var=None, + weight=None, + bias=None, + use_input_stats=True, + momentum=0.9, + eps=1e-05, + data_format="NCHW", + name=None): + """ + See more details in nn.layer.InstanceNorm2d. + + Parameters: + x(Tensor): Input Tensor. Its data type should be float32 or float64. + running_mean(Tensor): running mean. Default None. + running_var(Tensor): running variance. Default None. + weight(Tensor, optional): The weight tensor of instance_norm. Default: None. + bias(Tensor, optional): The bias tensor of instance_norm. Default: None. + eps(float, optional): A value added to the denominator for numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + use_input_stats(bool): Default True. + data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW" or "NCDHW". Default: "NCHW". + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same shape as x, the normalized result. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm_out = paddle.nn.functional.instance_norm(x) + + print(instance_norm_out.numpy()) + + """ + + if in_dygraph_mode(): + out, _, _ = core.ops.instance_norm(x, weight, bias, "epsilon", eps, + "momentum", momentum, "data_format", + data_format) + return out + + check_variable_and_dtype(x, 'input', ['float32', 'float64'], "InstanceNorm") + + attrs = {"epsilon": eps, "momentum": momentum, "data_format": data_format} + + if weight and bias: + inputs = {"X": [x], "Scale": [weight], "Bias": [bias]} + else: + inputs = {"X": [x]} + + helper = LayerHelper('instance_norm', **locals()) + saved_mean = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + saved_variance = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + instance_norm_out = helper.create_variable_for_type_inference(x.dtype) + + outputs = { + "Y": [instance_norm_out], + "SavedMean": [saved_mean], + "SavedVariance": [saved_variance] + } + + helper.append_op( + type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs) + return instance_norm_out diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index bb0bd5f70f1f9006285a3fb200537d35b4cf6c30..d30547ffdbe357e5524573df631d2fe02a35eb37 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -41,6 +41,7 @@ from ...fluid import core from ...fluid.framework import in_dygraph_mode from ...fluid.param_attr import ParamAttr from ...fluid.initializer import Constant +from paddle.framework import get_default_dtype from .. import functional as F @@ -423,7 +424,7 @@ class PReLU(layers.Layer): For more information, please refer to :ref:`api_guide_Name`. Shape: - - input: Tensor with any shape. + - input: Tensor with any shape. Default dtype is float32. - output: Tensor with the same shape as input. Examples: @@ -433,13 +434,14 @@ class PReLU(layers.Layer): import numpy as np paddle.disable_static() + paddle.set_default_dtype("float64") data = np.array([[[[-2.0, 3.0, -4.0, 5.0], [ 3.0, -4.0, 5.0, -6.0], [-7.0, -8.0, 8.0, 9.0]], [[ 1.0, -2.0, -3.0, 4.0], [-5.0, 6.0, 7.0, -8.0], - [ 6.0, 7.0, 8.0, 9.0]]]], 'float32') + [ 6.0, 7.0, 8.0, 9.0]]]], 'float64') x = paddle.to_tensor(data) m = paddle.nn.PReLU(1, 0.25) out = m(x) @@ -461,10 +463,10 @@ class PReLU(layers.Layer): self._weight = self.create_parameter( attr=self._weight_attr, - shape=[num_parameters], - dtype='float32', + shape=[self._num_parameters], + dtype=get_default_dtype(), is_bias=False, - default_initializer=Constant(init)) + default_initializer=Constant(self._init)) def forward(self, x): return F.prelu(x, self._weight) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 369d462a8089a30e6b749ef472aad66166cb590d..c7855b23bf6e6861326533e3cc93d7f7c5bd4ca2 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -1,4 +1,17 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,28 +27,877 @@ # TODO: define normalization api -import warnings from ...fluid.dygraph.nn import InstanceNorm from ...fluid.dygraph import BatchNorm #DEFINE_ALIAS -from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS -from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS +#from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS + +#from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS from ...fluid.dygraph import layers + +from ...framework import get_default_dtype, set_default_dtype from ...fluid.framework import in_dygraph_mode from ...fluid.initializer import Constant from ...fluid.param_attr import ParamAttr from ...fluid.data_feeder import check_variable_and_dtype, check_type -from ...fluid import core +from ...fluid import core, dygraph_utils + +from ..functional import batch_norm, layer_norm, instance_norm + +import numpy as np +import numbers +import warnings __all__ = [ 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', - 'SyncBatchNorm' + 'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'InstanceNorm1d', + 'InstanceNorm2d', 'InstanceNorm3d', 'SyncBatchNorm' ] + +class _InstanceNormBase(layers.Layer): + """ + This class is the base class for InstanceNorm1d, 2d and 3d. + + See InstanceNorm1d, InstanceNorm2d or InstanceNorm3d for more details. + """ + + def __init__(self, + num_features, + epsilon=1e-5, + momentum=0.9, + weight_attr=None, + bias_attr=None, + track_running_stats=False, + data_format="NCHW", + name=None): + super(_InstanceNormBase, self).__init__() + + if weight_attr == False or bias_attr == False: + assert weight_attr == bias_attr, "weight_attr and bias_attr must be set to False at the same time in InstanceNorm" + self._epsilon = epsilon + self._weight_attr = weight_attr + self._bias_attr = bias_attr + + if weight_attr != False and bias_attr != False: + self.scale = self.create_parameter( + attr=self._weight_attr, + shape=[num_features], + default_initializer=Constant(1.0), + is_bias=False) + self.bias = self.create_parameter( + attr=self._bias_attr, + shape=[num_features], + default_initializer=Constant(0.0), + is_bias=True) + else: + self.scale = None + self.bias = None + + def _check_input_dim(self, input): + raise NotImplementedError("InstanceNorm Base error") + + def forward(self, input): + self._check_input_dim(input) + + return instance_norm( + input, weight=self.scale, bias=self.bias, eps=self._epsilon) + + +class InstanceNorm1d(_InstanceNormBase): + """ + Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization. + + DataLayout: NCL `[batch, in_channels, length]` + + :math:`input` is the input features over a mini-batch. + + ..
math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Note: + `H` means height of feature map, `W` means width of feature map. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when track_running_stats is set to True, the global mean + and variance are also used during the train period. Default: False. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, and the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + to one. If it is set to False, weight_attr will not be created. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, and the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized to zero. + If it is set to False, bias_attr will not be created. Default: None. + data_format(str, optional): Specify the input data format, may be "NC" or "NCL". Default: "NCL". + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + + Shape: + - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length). + - output: tensor with the same shape as input x. + + Returns: + None. + + **Note**: + Momentum and track_running_stats are not effective for now. A later version will fix the problem. + + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm = paddle.nn.InstanceNorm1d(2) + instance_norm_out = instance_norm(x) + + print(instance_norm_out.numpy()) + + """ + + def _check_input_dim(self, input): + if len(input.shape) != 2 and len(input.shape) != 3: + raise ValueError('expected 2D or 3D input (got {}D input)'.format( + len(input.shape))) + + +class InstanceNorm2d(_InstanceNormBase): + """ + Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization. + + DataLayout: NCHW `[batch, in_channels, in_height, in_width]` + + + :math:`input` is the input features over a mini-batch. + + ..
math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Note: + `H` means height of feature map, `W` means width of feature map. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when track_running_stats is set to True, the global mean + and variance are also used during the train period. Default: False. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, and the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + to one. If it is set to False, weight_attr will not be created. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, and the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized to zero. + If it is set to False, bias_attr will not be created. Default: None. + data_format(str, optional): Specify the input data format, could be "NCHW". Default: NCHW. + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: 4-D tensor with shape: (batch, num_features, height, width). + - output: 4-D tensor with same shape as input x. + + Returns: + None. + + **Note**: + Momentum and track_running_stats are not effective for now. A later version will fix the problem. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm = paddle.nn.InstanceNorm2d(2) + instance_norm_out = instance_norm(x) + + print(instance_norm_out.numpy()) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 4: + raise ValueError('expected 4D input (got {}D input)'.format( + len(input.shape))) + + +class InstanceNorm3d(_InstanceNormBase): + """ + Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization. + + DataLayout: NCDHW `[batch, in_channels, D, in_height, in_width]` + + + :math:`input` is the input features over a mini-batch. + + ..
math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Note: + `H` means height of feature map, `W` means width of feature map. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when track_running_stats is set to True, the global mean + and variance are also used during the train period. Default: False. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, and the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + to one. If it is set to False, weight_attr will not be created. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, and the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized to zero. + If it is set to False, bias_attr will not be created. Default: None. + data_format(str, optional): Specify the input data format, could be "NCDHW". Default: NCDHW. + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: 5-D tensor with shape: (batch, num_features, dims, height, width). + - output: 5-D tensor with same shape as input x. + + Returns: + None. + + **Note**: + Momentum and track_running_stats are not effective for now. A later version will fix the problem. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm = paddle.nn.InstanceNorm3d(2) + instance_norm_out = instance_norm(x) + + print(instance_norm_out.numpy()) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 5: + raise ValueError('expected 5D input (got {}D input)'.format( + len(input.shape))) + + +class GroupNorm(layers.Layer): + """ + This interface is used to construct a callable object of the ``GroupNorm`` class. + For more details, refer to code examples. + It implements the function of the Group Normalization Layer. + Refer to `Group Normalization `_ . + + Parameters: + num_channels(int): The number of channels of input. + num_groups(int): The number of groups the channels are divided into. + epsilon(float, optional): The small value added to the variance to prevent + division by zero. Default: 1e-05. + weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable + scale :math:`g`.
If it is set to False, no scale will be added to the output units. + If it is set to None, the scale is initialized to one. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable + bias :math:`b`. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized to zero. Default: None. + data_format(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW. + name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: 4-D tensor with shape: (batch, num_channels, height, width). + - output: 4-D tensor with the same shape as input x. + + Returns: + None + + Examples: + .. code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 6, 2, 2)).astype('float32') + x = paddle.to_tensor(x_data) + group_norm = paddle.nn.GroupNorm(num_channels=6, num_groups=6) + group_norm_out = group_norm(x) + + print(group_norm_out.numpy()) + """ + + def __init__(self, + num_channels, + num_groups, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCHW', + name=None): + super(GroupNorm, self).__init__() + self._weight_attr = weight_attr + self._bias_attr = bias_attr + self._epsilon = epsilon + self._num_channels = num_channels + self._num_groups = num_groups + if data_format != 'NCHW': + raise ValueError("unsupported data format: " + data_format) + + param_shape = [self._num_channels] + + self.weight = self.create_parameter( + attr=self._weight_attr or False, + shape=param_shape, + default_initializer=Constant(1.0)) + + self.bias = self.create_parameter( + attr=self._bias_attr or False, shape=param_shape, is_bias=True) + + def forward(self, input): + inputs = {'X': input} + if self.bias is not None: + inputs['Bias'] = self.bias + if self.weight is not None: + inputs['Scale'] = self.weight + + # create output + mean_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + group_norm_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype) + + self._helper.append_op( + type="group_norm", + inputs=inputs, + outputs={ + "Y": group_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": self._epsilon, + "groups": self._num_groups}) + + return self._helper.append_activation(group_norm_out, None) + + +class LayerNorm(layers.Layer): + """ + :alias_main: paddle.nn.LayerNorm + :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm + :old_api: paddle.fluid.dygraph.LayerNorm + + This interface is used to construct a callable object of the ``LayerNorm`` class. + For more details, refer to code examples. + It implements the function of the Layer Normalization Layer and can be applied to mini-batch input data. + Refer to `Layer Normalization <https://arxiv.org/abs/1607.06450>`_ + + The formula is as follows: + + .. math:: + + \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} x_i + + \\sigma & = \\sqrt{\\frac{1}{H}\\sum_{i=1}^{H}{(x_i - \\mu)^2} + \\epsilon} + + y & = f(\\frac{g}{\\sigma}(x - \\mu) + b) + + - :math:`x`: the vector representation of the summed inputs to the neurons in that layer. + - :math:`H`: the number of hidden units in a layer + - :math:`\\epsilon`: the small value added to the variance to prevent division by zero.
+ - :math:`g`: the trainable scale parameter. + - :math:`b`: the trainable bias parameter. + + Parameters: + normalized_shape(int|list|tuple): Input shape from an expected input of + size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`. + If it is a single integer, this module will normalize over the last dimension + which is expected to be of that specific size. + epsilon(float, optional): The small value added to the variance to prevent + division by zero. Default: 1e-05. + weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable + gain :math:`g`. If it is False, weight is None. If it is None, a default :code:`ParamAttr` would be added as scale. The + :attr:`weight_attr` is initialized as 1 if it is added. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable + bias :math:`b`. If it is False, bias is None. If it is None, a default :code:`ParamAttr` would be added as bias. The + :attr:`bias_attr` is initialized as 0 if it is added. Default: None. + name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: 2-D, 3-D, 4-D or 5-D tensor. + - output: same shape as input x. + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + layer_norm = paddle.nn.LayerNorm(x_data.shape[1:]) + layer_norm_out = layer_norm(x) + + print(layer_norm_out.numpy()) + """ + + def __init__(self, + normalized_shape, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + name=None): + super(LayerNorm, self).__init__() + if isinstance(normalized_shape, numbers.Integral): + normalized_shape = [normalized_shape] + + self._normalized_shape = list(normalized_shape) + self._epsilon = epsilon + self._weight_attr = weight_attr + self._bias_attr = bias_attr + param_shape = [np.prod(self._normalized_shape)] + + if weight_attr is False: + self.weight = None + else: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + + if bias_attr is False: + self.bias = None + else: + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True) + + def forward(self, input): + return layer_norm( + input, + normalized_shape=self._normalized_shape, + weight=self.weight, + bias=self.bias, + epsilon=self._epsilon) + + +class _BatchNormBase(layers.Layer): + """ + BatchNorm base class. + """ + + def __init__(self, + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCHW', + track_running_stats=True, + name=None): + super(_BatchNormBase, self).__init__() + self._num_features = num_features + self._weight_attr = weight_attr + self._bias_attr = bias_attr + + if get_default_dtype() == 'float16': + set_default_dtype('float32') + + param_shape = [num_features] + + # create parameter + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = (self._weight_attr is False) or ( + self._weight_attr and self._weight_attr.learning_rate == 0.) + + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True) + self.bias.stop_gradient = (self._bias_attr is False) or ( + self._bias_attr and self._bias_attr.learning_rate == 0.)
+ + moving_mean_name = None + moving_variance_name = None + + if name is not None: + moving_mean_name = name + "_mean" + moving_variance_name = name + "_variance" + + self._mean = self.create_parameter( + attr=ParamAttr( + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + do_model_average=True), + shape=param_shape, + dtype=self._dtype) + self._mean.stop_gradient = True + + self._variance = self.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False, + do_model_average=True), + shape=param_shape, + dtype=self._dtype) + self._variance.stop_gradient = True + + self._data_format = data_format + self._in_place = False + self._momentum = momentum + self._epsilon = epsilon + self._fuse_with_relu = False + self._track_running_stats = track_running_stats + + def _check_input_dim(self, input): + raise NotImplementedError( + "subclasses of _BatchNormBase must implement _check_input_dim") + + def forward(self, input): + + self._check_input_dim(input) + + if not self.training and not self._track_running_stats: + raise ValueError( + 'During inference, track_running_stats is expected to be True.') + + if self.training and not self._track_running_stats: + warnings.warn( + "When training, we now always track global mean and variance.") + + return batch_norm( + input, + self._mean, + self._variance, + weight=self.weight, + bias=self.bias, + training=self.training, + momentum=self._momentum, + epsilon=self._epsilon, + data_format=self._data_format) + + +class BatchNorm1d(_BatchNormBase): + """ + Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. + + When track_running_stats = False, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + Calculated as follows: + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + When track_running_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global or running statistics (moving_mean and moving_variance), + usually obtained from a pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\epsilon` : a small value added to the variance to prevent division by zero + - :math:`\\gamma` : trainable scale parameter + - :math:`\\beta` : trainable shift parameter + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of batch_norm. 
If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable. + If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized to zero. Default: None. + data_format(str, optional): Specify the input data format, can be "NC" or "NCL". Default: "NCL". + track_running_stats(bool, optional): Whether to use global mean and variance. During training, + True will track global mean and variance for use in inference. For inference, track_running_stats must be + True. Default: True. + name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length). + - output: tensor with the same shape as input x. + + Returns: + None. + + **Note**: + Currently, track_running_stats is effectively always True; this will be fixed in a future version. + + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 1, 3)).astype('float32') + x = paddle.to_tensor(x_data) + batch_norm = paddle.nn.BatchNorm1d(1) + batch_norm_out = batch_norm(x) + + print(batch_norm_out.numpy()) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 2 and len(input.shape) != 3: + raise ValueError('expected 2D or 3D input (got {}D input)'.format( + len(input.shape))) + + +class BatchNorm2d(_BatchNormBase): + """ + Applies Batch Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. + + When track_running_stats = False, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + Calculated as follows: + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + When track_running_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global or running statistics (moving_mean and moving_variance), + usually obtained from a pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + ..
math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\epsilon` : a small value added to the variance to prevent division by zero + - :math:`\\gamma` : trainable scale parameter + - :math:`\\beta` : trainable shift parameter + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable. + If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized to zero. Default: None. + data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: "NCHW". + track_running_stats(bool, optional): Whether to use global mean and variance. During training, + True will track global mean and variance for use in inference. For inference, track_running_stats must be + True. Default: True. + name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: 4-D tensor with shape: (batch, num_features, height, width). + - output: 4-D tensor with the same shape as input x. + + Returns: + None + + **Note**: + Currently, track_running_stats is effectively always True; this will be fixed in a future version. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + batch_norm = paddle.nn.BatchNorm2d(1) + batch_norm_out = batch_norm(x) + + print(batch_norm_out.numpy()) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 4: + raise ValueError('expected 4D input (got {}D input)'.format( + len(input.shape))) + + +class BatchNorm3d(_BatchNormBase): + """ + Applies Batch Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. + + When track_running_stats = False, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + Calculated as follows: + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + When track_running_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global or running statistics (moving_mean and moving_variance),
usually obtained from a pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\epsilon` : a small value added to the variance to prevent division by zero + - :math:`\\gamma` : trainable scale parameter + - :math:`\\beta` : trainable shift parameter + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable. + If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized to zero. Default: None. + data_format(str, optional): Specify the input data format, the data format can be "NCDHW". Default: "NCDHW". + track_running_stats(bool, optional): Whether to use global mean and variance. During training, + True will track global mean and variance for use in inference. For inference, track_running_stats must be + True. Default: True. + name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: 5-D tensor with shape: (batch, num_features, depth, height, width). + - output: 5-D tensor with the same shape as input x. + + Returns: + None + + **Note**: + Currently, track_running_stats is effectively always True; this will be fixed in a future version. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 1, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + batch_norm = paddle.nn.BatchNorm3d(1) + batch_norm_out = batch_norm(x) + + print(batch_norm_out.numpy()) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 5: + raise ValueError('expected 5D input (got {}D input)'.format( + len(input.shape))) + + class SyncBatchNorm(layers.Layer): """ This interface is used to construct a callable object of the ``SyncBatchNorm`` class. diff --git a/python/paddle/optimizer/lr_scheduler.py b/python/paddle/optimizer/lr_scheduler.py index d01e62abaa6374e7fde892c6ae52c16b4b0f13e2..4ecaffb8fa509bdc54067bb25f8d1b5191b7ac1b 100644 --- a/python/paddle/optimizer/lr_scheduler.py +++ b/python/paddle/optimizer/lr_scheduler.py @@ -153,7 +153,7 @@ class NoamLR(_LRScheduler): warmup_steps(int): The number of warmup steps. A super parameter.
It is a python int number learning_rate (float): The initial learning rate. It is a python float number. Default: 1.0. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``NoamLR`` instance to schedule learning rate. @@ -168,14 +168,14 @@ class NoamLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() @@ -185,14 +185,13 @@ class NoamLR(_LRScheduler): main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -204,7 +203,7 @@ class NoamLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -251,7 +250,7 @@ class PiecewiseLR(_LRScheduler): values(list): A list of learning rate values that will be picked during different epoch boundaries. The type of element in the list is python float. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``PiecewiseLR`` instance to schedule learning rate. 
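For reference, the update rules that the NoamLR and PiecewiseLR docstrings above describe can be sketched in plain Python. This is an illustrative reading of the documented behavior, not the library implementation; the function names are ours:

    import math

    def noam_lr(epoch, d_model, warmup_steps, learning_rate=1.0):
        # Noam schedule: base * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        step = max(epoch, 1)  # guard against step 0
        return learning_rate * d_model ** -0.5 * min(
            step ** -0.5, step * warmup_steps ** -1.5)

    def piecewise_lr(epoch, boundaries, values):
        # values has one more entry than boundaries
        for i, b in enumerate(boundaries):
            if epoch < b:
                return values[i]
        return values[-1]

With boundaries=[3, 6, 9] and values=[0.1, 0.2, 0.3, 0.4] as in the example, piecewise_lr returns 0.1 for epochs 0-2, 0.2 for 3-5, 0.3 for 6-8, and 0.4 thereafter.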
@@ -267,14 +266,14 @@ class PiecewiseLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) + scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() @@ -284,14 +283,13 @@ class PiecewiseLR(_LRScheduler): main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) + scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -303,7 +301,7 @@ class PiecewiseLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -336,7 +334,7 @@ class NaturalExpLR(_LRScheduler): learning_rate (float): The initial learning rate. It is a python float number. gamma (float, optional): A Ratio to update the learning rate. Default: 0.1. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``NaturalExpLR`` instance to schedule learning rate. 
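The natural exponential decay documented above reduces to a one-line rule; a minimal sketch (illustrative only, not the library code):

    import math

    def natural_exp_lr(epoch, learning_rate, gamma=0.1):
        # lr decays as learning_rate * e^(-gamma * epoch)
        return learning_rate * math.exp(-gamma * epoch)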
@@ -352,14 +350,14 @@ class NaturalExpLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() @@ -369,14 +367,13 @@ class NaturalExpLR(_LRScheduler): main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -388,7 +385,7 @@ class NaturalExpLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -416,7 +413,7 @@ class InverseTimeLR(_LRScheduler): gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . It should be less than 1.0. Default: 0.1. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``InverseTimeLR`` instance to schedule learning rate. 
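Likewise, the inverse-time rule above can be sketched as (illustrative only):

    def inverse_time_lr(epoch, learning_rate, gamma=0.1):
        # lr shrinks as 1 / (1 + gamma * epoch)
        return learning_rate / (1.0 + gamma * epoch)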
@@ -432,14 +429,14 @@ class InverseTimeLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() @@ -449,14 +446,13 @@ class InverseTimeLR(_LRScheduler): main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -468,7 +464,7 @@ class InverseTimeLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -513,7 +509,7 @@ class PolynomialLR(_LRScheduler): cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decrease to ``end_lr`` . If False, the learning rate is monotone decreasing. Default: False. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``PolynomialLR`` instance to schedule learning rate. 
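A sketch of the polynomial decay described above, assuming the common defaults power=1.0 and cycle=False (illustrative only; parameter defaults here are assumptions, not confirmed by this diff):

    def polynomial_lr(epoch, learning_rate, decay_steps, end_lr=0.0001, power=1.0):
        # decay toward end_lr over decay_steps, then hold (cycle=False)
        step = min(epoch, decay_steps)
        frac = (1.0 - float(step) / decay_steps) ** power
        return (learning_rate - end_lr) * frac + end_lr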
@@ -529,31 +525,30 @@ class PolynomialLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -565,7 +560,7 @@ class PolynomialLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -629,7 +624,7 @@ class LinearLrWarmup(_LRScheduler): start_lr (float): Initial learning rate of warm up. end_lr (float): Final learning rate of warm up. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``LinearLrWarmup`` instance to schedule learning rate. 
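The linear warmup documented above interpolates from start_lr to end_lr over warmup_steps and then hands control back to the wrapped learning rate; a sketch (illustrative only):

    def linear_warmup_lr(epoch, warmup_steps, start_lr, end_lr, base_lr):
        if epoch < warmup_steps:
            return start_lr + (end_lr - start_lr) * epoch / float(warmup_steps)
        return base_lr  # the wrapped learning_rate takes over after warmup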
@@ -653,25 +648,24 @@ class LinearLrWarmup(_LRScheduler): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.LinearLrWarmup( + scheduler = paddle.optimizer.lr_scheduler.LinearLrWarmup( learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -683,7 +677,7 @@ class LinearLrWarmup(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -733,10 +727,10 @@ class ExponentialLR(_LRScheduler): Args: learning_rate (float): The initial learning rate. It is a python float number. - gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . - It should be less than 1.0. Default: 0.1. + gamma (float): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + It should be less than 1.0. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``ExponentialLR`` instance to schedule learning rate.
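Exponential decay multiplies the rate by gamma once per epoch; equivalently, in closed form (illustrative sketch only):

    def exponential_lr(epoch, learning_rate, gamma):
        # gamma should be less than 1.0 for the rate to decay
        return learning_rate * gamma ** epoch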
@@ -752,31 +746,30 @@ class ExponentialLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -788,7 +781,7 @@ class ExponentialLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -824,7 +817,7 @@ class MultiStepLR(_LRScheduler): gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . It should be less than 1.0. Default: 0.1. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . 
Returns: @@ -841,31 +834,30 @@ class MultiStepLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -877,7 +869,7 @@ class MultiStepLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -934,7 +926,7 @@ class StepLR(_LRScheduler): gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . It should be less than 1.0. Default: 0.1. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``StepLR`` instance to schedule learning rate. 
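The two step-based schedules here, MultiStepLR above and StepLR below, differ only in how the number of decays is derived from the epoch; a sketch (illustrative only, not the library code):

    import bisect

    def multi_step_lr(epoch, learning_rate, milestones, gamma=0.1):
        # decay once for every milestone already passed
        return learning_rate * gamma ** bisect.bisect_right(milestones, epoch)

    def step_lr(epoch, learning_rate, step_size, gamma=0.1):
        # decay every step_size epochs
        return learning_rate * gamma ** (epoch // step_size)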
@@ -951,31 +943,30 @@ class StepLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -987,7 +978,7 @@ class StepLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -1032,7 +1023,7 @@ class LambdaLR(_LRScheduler): learning_rate (float): The initial learning rate. It is a python float number. lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the initial learning rate by this factor. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``LambdaLR`` instance to schedule learning rate. 
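LambdaLR simply scales the initial rate by a user-supplied function of the epoch; a sketch (illustrative only):

    def lambda_lr(epoch, learning_rate, lr_lambda):
        # e.g. lr_lambda = lambda x: 0.95 ** x, as in the example above
        return learning_rate * lr_lambda(epoch)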
@@ -1048,31 +1039,30 @@ class LambdaLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -1084,7 +1074,7 @@ class LambdaLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -1130,8 +1120,8 @@ class ReduceLROnPlateau(_LRScheduler): change of ``loss`` is ``threshold`` . Default: ``'rel'`` . cooldown (int, optional): The number of epochs to wait before resuming normal operation. Default: 0. min_lr (float, optional): The lower bound of the learning rate after reduction. Default: 0. - epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than eps, the update is - ignored. Default: 1e-8. + epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon, + the update is ignored. Default: 1e-8. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``. 
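A compact sketch of the plateau logic the parameters above configure. This is a simplification that keeps state across calls and ignores threshold_mode and cooldown handling; it is an illustration of the documented behavior, not the library implementation:

    class PlateauSketch:
        def __init__(self, lr, factor=0.1, patience=10, min_lr=0.0, epsilon=1e-8):
            self.lr, self.factor, self.patience = lr, factor, patience
            self.min_lr, self.epsilon = min_lr, epsilon
            self.best, self.num_bad = None, 0

        def step(self, loss):
            if self.best is None or loss < self.best:
                self.best, self.num_bad = loss, 0  # improvement: reset counter
            else:
                self.num_bad += 1
            if self.num_bad > self.patience:
                new_lr = max(self.lr * self.factor, self.min_lr)
                if self.lr - new_lr > self.epsilon:  # skip negligible updates
                    self.lr = new_lr
                self.num_bad = 0
            return self.lr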
@@ -1149,31 +1139,30 @@ class ReduceLROnPlateau(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step(loss) - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -1185,7 +1174,7 @@ class ReduceLROnPlateau(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step(out[0]) """ @@ -1351,7 +1340,7 @@ class CosineAnnealingLR(_LRScheduler): T_max (int): Maximum number of iterations. It is half of the decay cycle of learning rate. eta_min (float|int, optional): Minimum learning rate, that is :math:`\eta_{min}` . Default: 0. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``CosineAnnealingLR`` instance to schedule learning rate. 
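The cosine annealing rule referenced above has the SGDR closed form; a sketch (illustrative only):

    import math

    def cosine_annealing_lr(epoch, learning_rate, T_max, eta_min=0.0):
        # anneal from learning_rate down to eta_min over T_max epochs
        return eta_min + (learning_rate - eta_min) * (
            1 + math.cos(math.pi * epoch / T_max)) / 2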
@@ -1367,31 +1356,30 @@ class CosineAnnealingLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -1403,7 +1391,7 @@ class CosineAnnealingLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 65469759a38087b2919ada6e73aebaaadf93c905..44ec0a5a4df985d5217011a841065ce504483ab7 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -27,7 +27,6 @@ from ..fluid.layers import expand_as #DEFINE_ALIAS from ..fluid.layers import slice #DEFINE_ALIAS from ..fluid.layers import strided_slice #DEFINE_ALIAS from ..fluid.layers import transpose #DEFINE_ALIAS -from ..fluid.layers import unique #DEFINE_ALIAS from ..fluid.layers import unstack #DEFINE_ALIAS from ..fluid.layers import scatter_nd_add #DEFINE_ALIAS @@ -608,6 +607,126 @@ def squeeze(x, axis=None, name=None): return layers.squeeze(x, axis, name) + +def unique(x, + return_index=False, + return_inverse=False, + return_counts=False, + axis=None, + name=None): + """ + Returns the unique elements of `x` in ascending order. + + Args: + x(Tensor): The input tensor, its data type should be float32, float64, int32, int64. + return_index(bool, optional): If True, also return the indices of the input tensor that + result in the unique Tensor. Default: False. + return_inverse(bool, optional): If True, also return the indices for where elements in + the original input ended up in the returned unique tensor. Default: False. + return_counts(bool, optional): If True, also return the counts for each unique element. Default: False. + axis(int, optional): The axis to apply unique. If None, the input will be flattened. + Default: None. + name(str, optional): Name for the operation. For more information, please refer to + :ref:`api_guide_Name`. Default: None. + + Returns: + tuple: (out, indices, inverse, counts). `out` is the unique tensor for `x`. `indices` is \ + provided only if `return_index` is True. 
`inverse` is provided only if `return_inverse` \ + is True. `counts` is provided only if `return_counts` is True. + + Examples: + .. code-block:: python + + import numpy as np + import paddle + + paddle.disable_static() + x_data = np.array([2, 3, 3, 1, 5, 3]) + x = paddle.to_tensor(x_data) + unique = paddle.unique(x) + np_unique = unique.numpy() # [1 2 3 5] + _, indices, inverse, counts = paddle.unique(x, return_index=True, return_inverse=True, return_counts=True) + np_indices = indices.numpy() # [3 0 1 4] + np_inverse = inverse.numpy() # [1 2 2 0 3 2] + np_counts = counts.numpy() # [1 1 3 1] + + x_data = np.array([[2, 1, 3], [3, 0, 1], [2, 1, 3]]) + x = paddle.to_tensor(x_data) + unique = paddle.unique(x) + np_unique = unique.numpy() # [0 1 2 3] + + unique = paddle.unique(x, axis=0) + np_unique = unique.numpy() + # [[2 1 3] + # [3 0 1]] + """ + if axis is None: + axis = [] + else: + axis = [axis] + + if in_dygraph_mode(): + out, inverse, indices, counts = core.ops.unique( + x, 'dtype', + convert_np_dtype_to_dtype_('int32'), 'return_index', return_index, + 'return_inverse', return_inverse, 'return_counts', return_counts, + 'axis', axis, "is_sorted", True) + outs = [out] + if return_index: + outs.append(indices) + if return_inverse: + outs.append(inverse) + if return_counts: + outs.append(counts) + + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + check_variable_and_dtype(x, "input", + ['float32', 'float64', 'int32', 'int64'], 'unique') + check_type(return_index, 'return_index', bool, 'unique') + check_type(return_inverse, 'return_inverse', bool, 'unique') + check_type(return_counts, 'return_counts', bool, 'unique') + if len(axis) != 0: + check_type(axis[0], 'axis', int, 'unique') + + helper = LayerHelper('unique', **locals()) + attrs = { + 'dtype': int(core.VarDesc.VarType.INT32), + "return_index": return_index, + "return_inverse": return_inverse, + "return_counts": return_counts, + "axis": axis, + "is_sorted": True + } + out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + inverse = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.INT64, stop_gradient=True) + outputs = {"Out": out, "Index": inverse} + outs = [out] + if return_index: + indices = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.INT64, stop_gradient=True) + outputs["Indices"] = indices + outs.append(indices) + if return_inverse: + outs.append(inverse) + if return_counts: + counts = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.INT64, stop_gradient=True) + outputs["Counts"] = counts + outs.append(counts) + + helper.append_op( + type="unique", inputs={"X": x}, attrs=attrs, outputs=outputs) + + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + def unsqueeze(x, axis, name=None): """ :alias_main: paddle.unsqueeze diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py old mode 100644 new mode 100755 index 85441597cf56c61ef33d552ec6a9f7c5019b4ec8..4c7eef5fa65108b35cf9792280eeb6c1e7ddf3fc --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -562,34 +562,52 @@ floor_mod = remainder #DEFINE_ALIAS def multiply(x, y, axis=-1, name=None): """ - :alias_main: paddle.multiply - :alias: paddle.multiply,paddle.tensor.multiply,paddle.tensor.math.multiply + Multiply two tensors element-wise. The equation is: -Examples: + .. math:: + out = x * y - .. code-block:: python + **Note**: + ``paddle.multiply`` supports broadcasting. 
If you would like to know more about broadcasting, please refer to :ref:`user_guide_broadcasting` . - import paddle - import numpy as np + Args: + x (Tensor): the input tensor, its data type should be float32, float64, int32, int64. + y (Tensor): the input tensor, its data type should be float32, float64, int32, int64. + axis (int, optional): The start dimension index for broadcasting ``y`` onto ``x``. Default: -1. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - paddle.disable_static() - x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) - y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) - res = paddle.multiply(x, y) - print(res.numpy()) # [[5, 12], [21, 32]] + Returns: + N-D Tensor. A location into which the result is stored. Its dimension equals that of ``x``. - x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) - y_data = np.array([1, 2], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) - res = paddle.multiply(x, y, axis=1) - print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]] + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) + y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + res = paddle.multiply(x, y) + print(res.numpy()) # [[5, 12], [21, 32]] + + x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) + y_data = np.array([1, 2], dtype=np.float32) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + res = paddle.multiply(x, y, axis=1) + print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]] """ op_type = 'elementwise_mul' act = None + if x.dtype != y.dtype: + raise TypeError( + 'Input tensors must be the same type, but received type of x: %s, type of y: %s ' + % (x.dtype, y.dtype)) + if in_dygraph_mode(): return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 91ad3bfa9cc1babd22cf9419ede33ae26f0dc900..eede022e05ba61bc23da517e7af7cd2eb58f5416 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -21,7 +21,6 @@ from ..fluid import core, layers from ..fluid.layers import argmin #DEFINE_ALIAS from ..fluid.layers import has_inf #DEFINE_ALIAS from ..fluid.layers import has_nan #DEFINE_ALIAS -from ..fluid.layers import topk #DEFINE_ALIAS __all__ = [ 'argmax', @@ -756,3 +755,100 @@ def masked_select(x, mask, name=None): type='masked_select', inputs={'X': x, 'Mask': mask}, outputs={'Y': out}) return out + + +def topk(x, k, axis=None, largest=True, sorted=True, name=None): + """ + This OP finds the values and indices of the k largest or smallest elements along the given axis. + If the input is a 1-D Tensor, it finds the k largest or smallest values and indices. + If the input is a Tensor with higher rank, this operator computes the top k values and indices along the :attr:`axis`. + + Args: + x(Tensor): An input N-D Tensor with type float32, float64, int32, int64. + k(int|Tensor): The number of top elements to look for along the axis. + axis(int, optional): Axis to compute indices along. The effective range + is [-R, R), where R is x.ndim. When axis < 0, it works the same way + as axis + R. Default is None, which means the last axis (-1). + largest(bool, optional): If set to True, the k largest elements are + returned in descending order; otherwise the k smallest elements are + returned in ascending order. Default is True. 
diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py
index 91ad3bfa9cc1babd22cf9419ede33ae26f0dc900..eede022e05ba61bc23da517e7af7cd2eb58f5416 100644
--- a/python/paddle/tensor/search.py
+++ b/python/paddle/tensor/search.py
@@ -21,7 +21,6 @@ from ..fluid import core, layers
 from ..fluid.layers import argmin  #DEFINE_ALIAS
 from ..fluid.layers import has_inf  #DEFINE_ALIAS
 from ..fluid.layers import has_nan  #DEFINE_ALIAS
-from ..fluid.layers import topk  #DEFINE_ALIAS

 __all__ = [
     'argmax',
@@ -756,3 +755,100 @@ def masked_select(x, mask, name=None):
         type='masked_select', inputs={'X': x,
                                       'Mask': mask}, outputs={'Y': out})
     return out
+
+
+def topk(x, k, axis=None, largest=True, sorted=True, name=None):
+    """
+    This OP finds the values and indices of the k largest or smallest elements along the given axis.
+    If the input is a 1-D Tensor, it finds the k largest or smallest values and indices.
+    If the input is a Tensor with higher rank, this operator computes the top k values and indices along the :attr:`axis`.
+
+    Args:
+        x(Tensor): An input N-D Tensor with type float32, float64, int32, int64.
+        k(int, Tensor): The number of top elements to look for along the axis.
+        axis(int, optional): Axis to compute indices along. The effective range
+            is [-R, R), where R is x.ndim. When axis < 0, it works the same way
+            as axis + R. Default is -1.
+        largest(bool, optional): If set to True, the k largest elements are
+            returned (descending order); otherwise the k smallest are returned
+            (ascending order). Default is True.
+        sorted(bool, optional): Controls whether to return the elements in sorted order. Default is True. On GPU devices, the values are always returned sorted.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        tuple(Tensor): A tuple of two Tensors (values, indices). The value data type is the same as the input `x`; the indices data type is int64.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+
+            data_1 = np.array([1, 4, 5, 7])
+            tensor_1 = paddle.to_tensor(data_1)
+            value_1, indices_1 = paddle.topk(tensor_1, k=1)
+            print(value_1.numpy())
+            # [7]
+            print(indices_1.numpy())
+            # [3]
+            data_2 = np.array([[1, 4, 5, 7], [2, 6, 2, 5]])
+            tensor_2 = paddle.to_tensor(data_2)
+            value_2, indices_2 = paddle.topk(tensor_2, k=1)
+            print(value_2.numpy())
+            # [[7]
+            #  [6]]
+            print(indices_2.numpy())
+            # [[3]
+            #  [1]]
+            value_3, indices_3 = paddle.topk(tensor_2, k=1, axis=-1)
+            print(value_3.numpy())
+            # [[7]
+            #  [6]]
+            print(indices_3.numpy())
+            # [[3]
+            #  [1]]
+            value_4, indices_4 = paddle.topk(tensor_2, k=1, axis=0)
+            print(value_4.numpy())
+            # [[2 6 5 7]]
+            print(indices_4.numpy())
+            # [[1 1 0 0]]
+
+    """
+    if in_dygraph_mode():
+        k = k.numpy().item(0) if isinstance(k, Variable) else k
+        if axis is None:
+            out, indices = core.ops.top_k_v2(x, 'k',
+                                             int(k), 'largest', largest,
+                                             'sorted', sorted)
+        else:
+            out, indices = core.ops.top_k_v2(x, 'k',
+                                             int(k), 'axis', axis, 'largest',
+                                             largest, 'sorted', sorted)
+        return out, indices
+
+    helper = LayerHelper("top_k_v2", **locals())
+    inputs = {"X": [x]}
+    attrs = {}
+    if isinstance(k, Variable):
+        inputs['K'] = [k]
+    else:
+        attrs = {'k': k}
+    attrs['largest'] = largest
+    attrs['sorted'] = sorted
+    if axis is not None:
+        attrs['axis'] = axis
+
+    values = helper.create_variable_for_type_inference(dtype=x.dtype)
+    indices = helper.create_variable_for_type_inference(dtype="int64")
+
+    helper.append_op(
+        type="top_k_v2",
+        inputs=inputs,
+        outputs={"Out": [values],
+                 "Indices": [indices]},
+        attrs=attrs)
+    indices.stop_gradient = True
+    return values, indices
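For intuition, the contract of the operator on a 1-D input can be mimicked with plain NumPy. The sketch below uses a full sort for clarity (a real kernel would use partial selection instead) and reproduces the first docstring example:

.. code-block:: python

    import numpy as np

    def topk_1d(x, k, largest=True):
        # Sort once, then slice off the k extreme entries; the contract is
        # the same as the operator's: values plus their original indices.
        order = np.argsort(x)  # ascending indices
        idx = order[::-1][:k] if largest else order[:k]
        return x[idx], idx

    values, indices = topk_1d(np.array([1, 4, 5, 7]), k=1)
    print(values, indices)  # [7] [3]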
diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos
index 049621b9388997deaeab618c09c579858a60d47e..b10e76a4b4d037bfa0d72e74e660cf696f5ee1d3 100644
--- a/tools/dockerfile/Dockerfile.centos
+++ b/tools/dockerfile/Dockerfile.centos
@@ -63,12 +63,12 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /o
     go get github.com/Masterminds/glide && \
     rm -rf /root/requirements.txt

-RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
+RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32

 RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
     cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu
index aa547fe74163fdbe08c1266817b7f1903e541ad5..9fe58885fa553671cf5c08bd51295f271f4df668 100644
--- a/tools/dockerfile/Dockerfile.ubuntu
+++ b/tools/dockerfile/Dockerfile.ubuntu
@@ -156,19 +156,19 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \

 RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3 --no-cache-dir install opencv-python && \
+    pip3 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.6 --no-cache-dir install opencv-python && \
+    pip3.6 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.7 --no-cache-dir install opencv-python && \
+    pip3.7 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip3.8 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.8 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.8 --no-cache-dir install opencv-python && \
+    pip3.8 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip --no-cache-dir install opencv-python
+    pip --no-cache-dir install opencv-python==4.2.0.32

 #For docstring checker
 RUN pip3 --no-cache-dir install pylint pytest astroid isort && \
diff --git a/tools/dockerfile/build_scripts/build_utils.sh b/tools/dockerfile/build_scripts/build_utils.sh
index 6f201a8579fea29ec6eaabf1faca77da26b11882..9f937cf9343784f10d186dd5bdcbace6f8a4e0e9 100755
--- a/tools/dockerfile/build_scripts/build_utils.sh
+++ b/tools/dockerfile/build_scripts/build_utils.sh
@@ -89,7 +89,7 @@ function do_cpython_build {
     fi
     # NOTE Make libpython shared library visible to python calls below
     LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
-    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel + LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel==0.32.2 cd / ls ${MY_DIR} local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py) diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 index ffef02dba4614f7bbbe13ebc30b40438a52b4590..e3a3374b943bc955d54afbef9755ed5147fad7d2 100644 --- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 +++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 @@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub ARG WITH_GPU ARG WITH_AVX -ENV WOBOQ OFF ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} @@ -199,12 +198,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure] RUN pip --no-cache-dir install certifi urllib3[secure] -# Install woboq_codebrowser to /woboq -RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - (cd /woboq \ - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release . \ - make) # ar mishandles 4GB files # https://sourceware.org/bugzilla/show_bug.cgi?id=14625 diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 index 837f0e486f6112bfc645c55ded8dfd0726d414d6..c27fdcea2401c26b1ef1dd377c42930b6e74fcf0 100644 --- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 +++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 @@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub ARG WITH_GPU ARG WITH_AVX -ENV WOBOQ OFF ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} @@ -212,12 +211,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure] RUN pip --no-cache-dir install certifi urllib3[secure] -# Install woboq_codebrowser to /woboq -RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - (cd /woboq \ - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release . 
-    make)

 # ar mishandles 4GB files
 # https://sourceware.org/bugzilla/show_bug.cgi?id=14625
diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py
index 102b50c43aeabc6ab2c67840edfaf42615cf51f5..033b4b8723aa30465cdb07198f470d7c09a0f326 100644
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
@@ -480,14 +480,8 @@ def get_filenames():
                 filename = ''
                 print("\nWARNING:----Exception in get api filename----\n")
                 print("\n" + api + ' module is ' + module + "\n")
-        if filename != '':
-            # rm contrib file
-            if filename.startswith(
-                    '../python/paddle/fluid/contrib'
-            ) or filename == '../python/paddle/verison.py':
-                pass
-            elif filename not in filenames:
-                filenames.append(filename)
+        if filename != '' and filename not in filenames:
+            filenames.append(filename)
         # get all methods
         method = ''
         if inspect.isclass(eval(api)):
@@ -557,14 +551,18 @@ def get_wlist():
     '''
     wlist = []
+    wlist_file = []
     with open("wlist.json", 'r') as load_f:
         load_dict = json.load(load_f)
         for key in load_dict:
-            wlist = wlist + load_dict[key]
-    return wlist
+            if key == 'wlist_file':
+                wlist_file = wlist_file + load_dict[key]
+            else:
+                wlist = wlist + load_dict[key]
+    return wlist, wlist_file


-wlist = get_wlist()
+wlist, wlist_file = get_wlist()

 if len(sys.argv) < 2:
     print("Error: inadequate number of arguments")
@@ -590,8 +588,14 @@ else:
     if len(filenames) == 0 and len(whl_error) == 0:
         print("-----API_PR.spec is the same as API_DEV.spec-----")
         exit(0)
-    elif '../python/paddle/fluid/core_avx.py' in filenames:
-        filenames.remove('../python/paddle/fluid/core_avx.py')
+    # Collect the whitelisted files first and rebuild the list; calling
+    # filenames.remove() while iterating over filenames would skip entries.
+    rm_file = [f for f in filenames
+               if any(f.startswith(w_file) for w_file in wlist_file)]
+    filenames = [f for f in filenames if f not in rm_file]
+    if len(rm_file) != 0:
+        print("Removed whitelisted files: %s" % rm_file)
     print("API_PR is diff from API_DEV: %s" % filenames)
     one_part_filenum = int(math.ceil(len(filenames) / cpus))
     if one_part_filenum == 0:
diff --git a/tools/wlist.json b/tools/wlist.json
index 6a0360fbcd9d06885d8dff7044e0cce63fa7d92c..c6114918e5932a9cfd139fd0212698c5ea97d3cc 100644
--- a/tools/wlist.json
+++ b/tools/wlist.json
@@ -1,4 +1,10 @@
 {
+    "wlist_file" : [
+        "../python/paddle/fluid/contrib",
+        "../python/paddle/verison.py",
+        "../python/paddle/fluid/core_avx.py",
+        "../python/paddle/distributed"
+    ],
     "wlist_inneed":[
         "append_LARS",
         "BuildStrategy.debug_graphviz_path",