Commit 6ad85cbd authored by: J jingqinghe
......@@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
ARG WITH_GPU
ARG WITH_AVX
ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-ON}
ENV WITH_AVX=${WITH_AVX:-ON}
......@@ -149,21 +148,11 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
# version until jupyter fixes this issue.
# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
# sphinx-rtd-theme] since the -U option would cause sphinx to be updated to the
# newest version (1.7.1 for now), which makes the documentation build fail.
RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
pip --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark
RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
......@@ -184,9 +173,9 @@ RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
RUN pip3 --no-cache-dir install coverage
RUN pip3.6 --no-cache-dir install coverage
RUN pip3.7 --no-cache-dir install coverage
RUN pip --no-cache-dir install coverage
COPY ./python/requirements.txt /root/
......@@ -204,12 +193,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
RUN pip --no-cache-dir install certifi urllib3[secure]
# Install woboq_codebrowser to /woboq
RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
(cd /woboq && \
cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-DCMAKE_BUILD_TYPE=Release . && \
make)
# ar mishandles 4GB files
# https://sourceware.org/bugzilla/show_bug.cgi?id=14625
......
......@@ -110,10 +110,12 @@ function(copy_part_of_thrid_party TARGET DST)
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
if (WITH_CRYPTO)
set(dst_dir "${DST}/third_party/install/cryptopp")
copy(${TARGET}
SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
endif()
set(dst_dir "${DST}/third_party/install/xxhash")
copy(${TARGET}
......
......@@ -117,7 +117,7 @@ static void TransData(const framework::LoDTensor &src_item,
TensorCopy(src_item, platform::CPUPlace(), dst_item);
#endif
} else {
dst_item->ShareDataWith(src_item);
TensorCopy(src_item, platform::CPUPlace(), dst_item);
}
} else {
dst_item->clear();
......
......@@ -113,7 +113,9 @@ message DistributedStrategy {
optional bool fuse_all_reduce_ops = 18 [ default = true ];
optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ];
optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ];
// optional bool enable_backward_optimizer_op_deps = 19 [ default = true ];
optional bool cudnn_exhaustive_search = 21 [ default = true ];
optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
optional RecomputeConfig recompute_configs = 101;
optional AMPConfig amp_configs = 102;
......
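The new cuDNN fields above are ordinary optional protobuf scalars, so the generated C++ API exposes the usual accessors. A minimal sketch of setting them (message type and namespace assumed from the .proto, not taken from this commit):
// Hypothetical usage of the protobuf-generated setters for fields 21-23.
DistributedStrategy strategy;
strategy.set_cudnn_exhaustive_search(true);
strategy.set_conv_workspace_size_limit(4000);
strategy.set_cudnn_batchnorm_spatial_persistent(true);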
......@@ -29,14 +29,20 @@ namespace framework {
namespace compatible {
struct OpUpdateRecord {
enum class Type { kInvalid = 0, kModifyAttr, kNewAttr };
enum class Type {
kInvalid = 0,
kModifyAttr,
kNewAttr,
kNewInput,
kNewOutput
};
Type type_;
std::string remark_;
};
struct ModifyAttr : OpUpdateRecord {
ModifyAttr(const std::string& name, const std::string& remark,
boost::any default_value)
const boost::any& default_value)
: OpUpdateRecord({Type::kModifyAttr, remark}),
name_(name),
default_value_(default_value) {
......@@ -47,9 +53,10 @@ struct ModifyAttr : OpUpdateRecord {
std::string name_;
boost::any default_value_;
};
struct NewAttr : OpUpdateRecord {
NewAttr(const std::string& name, const std::string& remark,
boost::any default_value)
const boost::any& default_value)
: OpUpdateRecord({Type::kNewAttr, remark}),
name_(name),
default_value_(default_value) {}
......@@ -59,6 +66,22 @@ struct NewAttr : OpUpdateRecord {
boost::any default_value_;
};
struct NewInput : OpUpdateRecord {
NewInput(const std::string& name, const std::string& remark)
: OpUpdateRecord({Type::kNewInput, remark}), name_(name) {}
private:
std::string name_;
};
struct NewOutput : OpUpdateRecord {
NewOutput(const std::string& name, const std::string& remark)
: OpUpdateRecord({Type::kNewOutput, remark}), name_(name) {}
private:
std::string name_;
};
class OpVersionDesc {
public:
OpVersionDesc& ModifyAttr(const std::string& name, const std::string& remark,
......@@ -75,6 +98,18 @@ class OpVersionDesc {
return *this;
}
OpVersionDesc& NewInput(const std::string& name, const std::string& remark) {
infos_.push_back(std::shared_ptr<OpUpdateRecord>(
new compatible::NewInput(name, remark)));
return *this;
}
OpVersionDesc& NewOutput(const std::string& name, const std::string& remark) {
infos_.push_back(std::shared_ptr<OpUpdateRecord>(
new compatible::NewOutput(name, remark)));
return *this;
}
private:
std::vector<std::shared_ptr<OpUpdateRecord>> infos_;
};
......
......@@ -42,7 +42,14 @@ TEST(test_operator_version, test_operator_version) {
"height",
"In order to represent a two-dimensional rectangle, the "
"parameter height is added.",
0));
0))
.AddCheckpoint(
R"ROC(
Add an input [X2] and an output [Y2]
)ROC",
framework::compatible::OpVersionDesc()
.NewInput("X2", "The second input.")
.NewOutput("Y2", "The second output."));
}
} // namespace compatible
} // namespace framework
......
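For reference, the checkpoint above is the pattern an operator would use to record new-input/new-output records. A hedged sketch, assuming the REGISTER_OP_VERSION macro from the op version registry:
// Sketch only: register a version checkpoint documenting a new input and a
// new output for a hypothetical operator `my_op`.
REGISTER_OP_VERSION(my_op)
    .AddCheckpoint(
        R"ROC(Add an input [X2] and an output [Y2].)ROC",
        paddle::framework::compatible::OpVersionDesc()
            .NewInput("X2", "The second input.")
            .NewOutput("Y2", "The second output."));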
......@@ -64,10 +64,9 @@ if (NOT APPLE AND NOT WIN32)
SRCS analyzer_tester.cc
EXTRA_DEPS reset_tensor_array paddle_fluid_shared
ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
elseif(NOT WIN32)
# TODO: Fix this unittest failed on Windows
inference_analysis_test(test_analyzer
SRCS analyzer_tester.cc
EXTRA_DEPS reset_tensor_array paddle_inference_api
ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
elseif(WIN32)
inference_analysis_test(test_analyzer
SRCS analyzer_tester.cc
EXTRA_DEPS reset_tensor_array paddle_inference_api
ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
endif()
......@@ -54,8 +54,7 @@ if(WITH_TESTING)
ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
elseif(NOT WIN32)
# TODO: Fix this unittest failed on Windows
elseif(WIN32)
inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
......@@ -67,8 +66,7 @@ endif()
if (NOT APPLE AND NOT WIN32)
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_fluid_shared
ARGS --dirname=${WORD2VEC_MODEL_DIR})
elseif (NOT WIN32)
# TODO: Fix this unittest failed on Windows
elseif (WIN32)
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
ARGS --dirname=${WORD2VEC_MODEL_DIR})
endif()
......@@ -132,6 +132,7 @@ if(NOT APPLE AND WITH_MKLML)
set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool")
download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc)
set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150)
else()
# TODO: fix this test on MACOS and OPENBLAS, the reason is that
# fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
......@@ -191,6 +192,7 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz")
inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true)
set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150)
# text_classification
set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
......
......@@ -32,19 +32,20 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
${EXTERNAL_PROJECT_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${INSTALL_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} &&
${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
URL ${URL}/${FILENAME}
DOWNLOAD_DIR ${INSTALL_DIR}
DOWNLOAD_NO_EXTRACT 1
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR}
${CMAKE_COMMAND} -E tar xzf ${FILENAME}
UPDATE_COMMAND ""
INSTALL_COMMAND ""
)
endfunction()
set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32)
if(NOT EXISTS ${WORD2VEC_INSTALL_DIR})
inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
endif()
set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
......
......@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/affine_grid_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {
......@@ -84,14 +86,14 @@ __global__ void affine_grid_grad_kernel(const int count, int n, int out_h,
int theta_offset = n * 6; // 2 * 3;
T out_grad_x = out_grad[index * 2];
atomicAdd(theta_grad + theta_offset, out_grad_x * h_coor);
atomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor);
atomicAdd(theta_grad + theta_offset + 2, out_grad_x);
platform::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * h_coor);
platform::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor);
platform::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x);
T out_grad_y = out_grad[index * 2 + 1];
atomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor);
atomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor);
atomicAdd(theta_grad + theta_offset + 5, out_grad_y);
platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor);
platform::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor);
platform::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y);
}
}
......
......@@ -31,7 +31,7 @@ static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH,
int sW, int H, int W,
T delta) {
if (in_bounds(h, w, H, W)) {
atomicAdd(data + h * sH + w * sW, delta);
platform::CudaAtomicAdd(data + h * sH + w * sW, delta);
}
}
......
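Switching from the raw atomicAdd to platform::CudaAtomicAdd lets these kernels compile for element types that lack a native atomic add on every architecture. An illustrative sketch of the wrapper pattern (not Paddle's actual implementation):
// Native path: float atomicAdd exists on all supported architectures.
__device__ inline float SketchAtomicAdd(float* addr, float val) {
  return atomicAdd(addr, val);
}
// Fallback path: emulate double atomicAdd with a compare-and-swap loop on
// architectures older than sm_60.
__device__ inline double SketchAtomicAdd(double* addr, double val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
  return atomicAdd(addr, val);
#else
  unsigned long long* a = reinterpret_cast<unsigned long long*>(addr);
  unsigned long long old = *a, assumed;
  do {
    assumed = old;
    old = atomicCAS(a, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);  // retry until no other thread intervened
  return __longlong_as_double(old);
#endif
}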
......@@ -111,7 +111,8 @@ static void CallPythonFunc(py::object *callable,
out->set_lod(py_out_tensor->lod());
out->ShareDataWith(*py_out_tensor);
} catch (py::cast_error &) {
PADDLE_THROW("The %d-th output must be LoDTensor", i);
PADDLE_THROW(platform::errors::InvalidArgument(
"The %d-th output must be LoDTensor.", i));
}
}
}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdio.h>
#include <cstdio>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
// set cub base traits in order to handle float16
namespace cub {
template <>
struct NumericTraits<paddle::platform::float16>
: BaseTraits<FLOATING_POINT, true, false, uint16_t,
paddle::platform::float16> {};
} // namespace cub
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
struct SegmentOffsetIter {
EIGEN_DEVICE_FUNC
explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
return idx * num_cols_;
}
int num_cols_;
};
// Iterator mapping a flat element index into its column index
struct ColumnIndexIter {
explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(
const Eigen::array<int, 1>& ix) const {
return ix[0] % num_cols_;
}
int num_cols_;
};
inline static int GetDesiredBlockDim(int dim) {
if (dim > 128) {
return 256;
} else if (dim > 64) {
return 128;
} else if (dim > 32) {
return 64;
} else {
return 32;
}
}
template <typename T>
__global__ void InitIndex(T* indices, T num_rows, T num_cols) {
int col_id = threadIdx.x;
int row_id = blockIdx.x;
for (int64_t j = row_id; j < num_rows; j += gridDim.x) {
for (int64_t i = col_id; i < num_cols; i += blockDim.x) {
indices[j * num_cols + i] = i;
}
}
}
template <typename T>
struct Pair {
__device__ __forceinline__ Pair() {}
__device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {}
__device__ __forceinline__ void set(T value, int64_t id) {
v = value;
this->id = id;
}
__device__ __forceinline__ void operator=(const Pair<T>& in) {
v = in.v;
id = in.id;
}
__device__ __forceinline__ bool operator<(const T value) const {
return (v < value);
}
__device__ __forceinline__ bool operator>(const T value) const {
return (v > value);
}
__device__ __forceinline__ bool operator<(const Pair<T>& in) const {
return (v < in.v) || ((v == in.v) && (id > in.id));
}
__device__ __forceinline__ bool operator>(const Pair<T>& in) const {
return (v > in.v) || ((v == in.v) && (id < in.id));
}
T v;
int64_t id;
};
template <typename T>
__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
int beam_size, const bool& largest) {
for (int k = beam_size - 2; k >= 0; k--) {
if (largest) {
if (topk[k] < p) {
topk[k + 1] = topk[k];
} else {
topk[k + 1] = p;
return;
}
} else {
if (topk[k] > p) {
topk[k + 1] = topk[k];
} else {
topk[k + 1] = p;
return;
}
}
}
topk[0] = p;
}
template <typename T, int BlockSize>
__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
int dim, int beam_size,
const bool& largest) {
while (idx < dim) {
if (largest) {
if (topk[beam_size - 1] < src[idx]) {
Pair<T> tmp(src[idx], idx);
AddTo<T>(topk, tmp, beam_size, largest);
}
} else {
if (topk[beam_size - 1] > src[idx]) {
Pair<T> tmp(src[idx], idx);
AddTo<T>(topk, tmp, beam_size, largest);
}
}
idx += BlockSize;
}
}
template <typename T, int BlockSize>
__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
int dim, const Pair<T>& max,
int beam_size, const bool& largest) {
while (idx < dim) {
if (largest) {
if (topk[beam_size - 1] < src[idx]) {
Pair<T> tmp(src[idx], idx);
if (tmp < max) {
AddTo<T>(topk, tmp, beam_size, largest);
}
}
} else {
if (topk[beam_size - 1] > src[idx]) {
Pair<T> tmp(src[idx], idx);
if (tmp > max) {
AddTo<T>(topk, tmp, beam_size, largest);
}
}
}
idx += BlockSize;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
int beam_size, const T* src,
bool* firstStep, bool* is_empty,
Pair<T>* max, int dim,
const int tid, bool largest) {
if (*beam > 0) {
int length = (*beam) < beam_size ? *beam : beam_size;
if (*firstStep) {
*firstStep = false;
GetTopK<T, BlockSize>(topk, src, tid, dim, length, largest);
} else {
for (int k = 0; k < MaxLength; k++) {
if (k < MaxLength - (*beam)) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(-static_cast<T>(INFINITY), -1);
}
}
if (!(*is_empty)) {
GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
length, largest);
}
}
*max = topk[MaxLength - 1];
if ((*max).v == -static_cast<T>(1)) *is_empty = true;
*beam = 0;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
Pair<T> topk[], T** topVal,
int64_t** topIds, int* beam, int* k,
const int tid, const int warp,
const bool& largest) {
while (true) {
__syncthreads();
if (tid < BlockSize / 2) {
if (largest) {
if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) {
maxid[tid] = tid + BlockSize / 2;
} else {
maxid[tid] = tid;
}
} else {
if (sh_topk[tid] > sh_topk[tid + BlockSize / 2]) {
maxid[tid] = tid + BlockSize / 2;
} else {
maxid[tid] = tid;
}
}
}
__syncthreads();
for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) {
if (tid < stride) {
if (largest) {
if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) {
maxid[tid] = maxid[tid + stride];
}
} else {
if (sh_topk[maxid[tid]] > sh_topk[maxid[tid + stride]]) {
maxid[tid] = maxid[tid + stride];
}
}
}
__syncthreads();
}
__syncthreads();
if (tid == 0) {
**topVal = sh_topk[maxid[0]].v;
**topIds = sh_topk[maxid[0]].id;
(*topVal)++;
(*topIds)++;
}
if (tid == maxid[0]) (*beam)++;
if (--(*k) == 0) break;
__syncthreads();
if (tid == maxid[0]) {
if (*beam < MaxLength) {
sh_topk[tid] = topk[*beam];
}
}
// NOTE(zcd): temporary solution
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
if (maxid[0] / 32 == warp) {
if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) ==
MaxLength)
break;
}
}
}
/**
* Each block compute one sample.
* In a block:
* 1. every thread get top MaxLength value;
* 2. merge to sh_topk, block reduce and get max value;
* 3. go to the second step, until one thread's topk value is null;
* 4. go to the first step, until the topk value is obtained.
*/
template <typename T, int MaxLength, int BlockSize>
__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
const T* src, int lds, int dim, int k,
int grid_dim, int num, bool largest = true) {
__shared__ Pair<T> sh_topk[BlockSize];
const int tid = threadIdx.x;
const int warp = threadIdx.x / 32;
const int bid = blockIdx.x;
for (int i = bid; i < num; i += grid_dim) {
int top_num = k;
__shared__ int maxid[BlockSize / 2];
T* out = output + i * output_stride;
int64_t* inds = indices + i * k;
Pair<T> topk[MaxLength];
int beam = MaxLength;
Pair<T> max;
bool is_empty = false;
bool firststep = true;
for (int j = 0; j < MaxLength; j++) {
if (largest) {
topk[j].set(-static_cast<T>(INFINITY), -1);
} else {
topk[j].set(static_cast<T>(INFINITY), -1);
}
}
while (top_num) {
ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k, src + i * lds,
&firststep, &is_empty, &max, dim,
tid, largest);
sh_topk[tid] = topk[0];
BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &out, &inds,
&beam, &top_num, tid, warp, largest);
}
}
}
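// Illustrative launch sketch for KeMatrixTopK (names `dev_ctx`, `grid`,
// `output_data`, `indices_data`, `input_data`, `width`, `num_rows` are
// assumed, mirroring the dispatch in the TopK kernels elsewhere in this
// commit):
//   constexpr int kBlockDim = 256;
//   KeMatrixTopK<float, 5, kBlockDim><<<grid, kBlockDim, 0,
//                                       dev_ctx.stream()>>>(
//       output_data, k, indices_data, input_data, /*lds=*/width,
//       /*dim=*/width, k, grid, num_rows, /*largest=*/true);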
template <typename T, int MaxLength, int BlockSize>
__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad,
size_t rows, size_t cols, size_t k) {
for (size_t i = 0; i < rows; ++i) {
for (size_t j = 0; j < cols; ++j) {
x_grad[i * cols + j] = 0;
}
for (size_t j = 0; j < k; ++j) {
size_t idx = indices[i * k + j];
x_grad[i * cols + idx] = out_grad[i * k + j];
}
}
}
// the grad assign with the axis
template <typename T>
__global__ void AssignGradWithAxis(const T* grad_out, const int64_t* indices,
T* grad_in, int pre, int post,
int raw_height, int k) {
// raw_height is the length of the topk axis
for (int i = blockIdx.x; i < pre; i += gridDim.x) {
const int& base_index = i * post * k;
const int& base_grad = i * post * raw_height;
for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) {
grad_in[base_grad + j] = static_cast<T>(0);
}
__syncthreads();
for (int j = threadIdx.x; j < k * post; j += blockDim.x) {
const int64_t idx_ij = indices[base_index + j];
const int64_t in_ij = base_grad + (idx_ij * post) + (j % post);
grad_in[in_ij] = grad_out[base_index + j];
}
}
}
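// Worked example for AssignGradWithAxis (illustration only): with pre = 1,
// post = 2, raw_height = 4 and k = 2, a block first zero-fills the
// 4 * 2 = 8 grad_in slots, then for j = 0..3 reads idx_ij = indices[j] and
// scatters grad_out[j] into grad_in[idx_ij * 2 + (j % 2)].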
// use the radix sort for the topk
template <typename T>
bool SortTopk(const platform::CUDADeviceContext& ctx,
const framework::Tensor* input_tensor, const int64_t num_cols,
const int64_t num_rows, const int k,
framework::Tensor* out_tensor, framework::Tensor* indices_tensor,
bool largest = true) {
auto cu_stream = ctx.stream();
Tensor input_indices;
const std::vector<int64_t> dims = {num_rows, num_cols};
auto dim = framework::make_ddim(dims);
input_indices.Resize(dim);
// input_indices.Resize(num_rows*num_cols);
input_indices.mutable_data<int64_t>(ctx.GetPlace());
size_t temp_storage_bytes = -1;
auto ComputeBlockSize = [](int col) {
if (col > 512)
return 1024;
else if (col > 256 && col <= 512)
return 512;
else if (col > 128 && col <= 256)
return 256;
else if (col > 64 && col <= 128)
return 128;
else
return 64;
};
int block_size = ComputeBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
// in practice, num_rows < max_grid_size
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
: maxGridDimX;
// Init an index array
InitIndex<int64_t><<<grid_size, block_size, 0, cu_stream>>>(
input_indices.data<int64_t>(), num_rows, num_cols);
// create an iterator for counting the input
cub::CountingInputIterator<int64_t> counting_iter(0);
// segment_offset is used to move to the next row
cub::TransformInputIterator<int64_t, SegmentOffsetIter,
cub::CountingInputIterator<int64_t>>
segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
T* sorted_values_ptr;
int64_t* sorted_indices_ptr;
Tensor temp_values;
Tensor temp_indices;
const T* input = input_tensor->data<T>();
T* values = out_tensor->data<T>();
int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
if (k == num_cols) {
// Doing a full sort.
sorted_values_ptr = values;
sorted_indices_ptr = indices;
} else {
temp_values.Resize(dim);
temp_indices.Resize(dim);
sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
}
// Get temp storage buffer size; a fixed buffer could be allocated to save time.
if (largest) {
auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
nullptr, temp_storage_bytes, input, sorted_values_ptr,
input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
cu_stream);
if (err != cudaSuccess) {
LOG(ERROR)
<< "TopKOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate "
"temp_storage_bytes, status: "
<< cudaGetErrorString(err);
return false;
}
} else {
auto err = cub::DeviceSegmentedRadixSort::SortPairs(
nullptr, temp_storage_bytes, input, sorted_values_ptr,
input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
cu_stream);
if (err != cudaSuccess) {
LOG(ERROR) << "TopKOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairs to calculate "
"temp_storage_bytes, status: "
<< cudaGetErrorString(err);
return false;
}
}
Tensor temp_storage;
temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
if (largest) {
auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
temp_storage.data<uint8_t>(), temp_storage_bytes, input,
sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
0, sizeof(T) * 8, cu_stream);
if (err != cudaSuccess) {
LOG(ERROR) << "TopKOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to "
"sort input, "
"temp_storage_bytes: "
<< temp_storage_bytes
<< ", status: " << cudaGetErrorString(err);
return false;
}
} else {
auto err = cub::DeviceSegmentedRadixSort::SortPairs(
temp_storage.data<uint8_t>(), temp_storage_bytes, input,
sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
0, sizeof(T) * 8, cu_stream);
if (err != cudaSuccess) {
LOG(ERROR) << "TopKOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairs to "
"sort input, "
"temp_storage_bytes: "
<< temp_storage_bytes
<< ", status: " << cudaGetErrorString(err);
return false;
}
}
auto& dev = *ctx.eigen_device();
if (k < num_cols) {
// copy sliced data to output.
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, 0};
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
auto e_tmp_indices = EigenMatrix<int64_t>::From(temp_indices);
std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
auto dim = framework::make_ddim(odims);
auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
auto e_tmp_values = EigenMatrix<T>::From(temp_values);
e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
}
return true;
}
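// Hypothetical host-side use of SortTopk for a [rows x cols] tensor
// (`dev_ctx`, `input`, `out`, `inds` assumed to be prepared by the caller):
//   if (SortTopk<float>(dev_ctx, &input, /*num_cols=*/cols,
//                       /*num_rows=*/rows, k, &out, &inds,
//                       /*largest=*/true)) {
//     // top-k values are in `out`, matching indices in `inds`
//   } else {
//     // fall back to the KeMatrixTopK kernel when the cub path fails
//   }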
} // namespace operators
} // namespace paddle
......@@ -12,474 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdio>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_function_cuda.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
// set cub base traits in order to handle float16
namespace cub {
template <>
struct NumericTraits<paddle::platform::float16>
: BaseTraits<FLOATING_POINT, true, false, uint16_t,
paddle::platform::float16> {};
} // namespace cub
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
struct Pair {
__device__ __forceinline__ Pair() {}
__device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {}
__device__ __forceinline__ void set(T value, int64_t id) {
v = value;
this->id = id;
}
__device__ __forceinline__ void operator=(const Pair<T>& in) {
v = in.v;
id = in.id;
}
__device__ __forceinline__ bool operator<(const T value) const {
return (v < value);
}
__device__ __forceinline__ bool operator<(const Pair<T>& in) const {
return (v < in.v) || ((v == in.v) && (id > in.id));
}
__device__ __forceinline__ bool operator>(const Pair<T>& in) const {
return (v > in.v) || ((v == in.v) && (id < in.id));
}
T v;
int64_t id;
};
template <typename T>
__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
int beam_size) {
for (int k = beam_size - 2; k >= 0; k--) {
if (topk[k] < p) {
topk[k + 1] = topk[k];
} else {
topk[k + 1] = p;
return;
}
}
topk[0] = p;
}
template <typename T, int beam_size>
__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p) {
for (int k = beam_size - 2; k >= 0; k--) {
if (topk[k] < p) {
topk[k + 1] = topk[k];
} else {
topk[k + 1] = p;
return;
}
}
topk[0] = p;
}
template <typename T, int BlockSize>
__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
int dim, int beam_size) {
while (idx < dim) {
if (topk[beam_size - 1] < src[idx]) {
Pair<T> tmp(src[idx], idx);
AddTo<T>(topk, tmp, beam_size);
}
idx += BlockSize;
}
}
template <typename T, int BlockSize>
__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
int dim, const Pair<T>& max,
int beam_size) {
while (idx < dim) {
if (topk[beam_size - 1] < src[idx]) {
Pair<T> tmp(src[idx], idx);
if (tmp < max) {
AddTo<T>(topk, tmp, beam_size);
}
}
idx += BlockSize;
}
}
template <typename T, int BlockSize>
__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
int idx, int dim, int beam_size) {
while (idx < dim) {
if (topk[beam_size - 1] < val[idx]) {
Pair<T> tmp(val[idx], col[idx]);
AddTo<T>(topk, tmp, beam_size);
}
idx += BlockSize;
}
}
template <typename T, int BlockSize>
__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
int idx, int dim, const Pair<T>& max,
int beam_size) {
while (idx < dim) {
if (topk[beam_size - 1] < val[idx]) {
Pair<T> tmp(val[idx], col[idx]);
if (tmp < max) {
AddTo<T>(topk, tmp, beam_size);
}
}
idx += BlockSize;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
int beam_size, const T* src,
bool* firstStep, bool* is_empty,
Pair<T>* max, int dim,
const int tid) {
if (*beam > 0) {
int length = (*beam) < beam_size ? *beam : beam_size;
if (*firstStep) {
*firstStep = false;
GetTopK<T, BlockSize>(topk, src, tid, dim, length);
} else {
for (int k = 0; k < MaxLength; k++) {
if (k < MaxLength - (*beam)) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(-static_cast<T>(INFINITY), -1);
}
}
if (!(*is_empty)) {
GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
length);
}
}
*max = topk[MaxLength - 1];
if ((*max).v == -static_cast<T>(1)) *is_empty = true;
*beam = 0;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
int beam_size, const T* val,
int* col, bool* firstStep,
bool* is_empty, Pair<T>* max,
int dim, const int tid) {
if (*beam > 0) {
int length = (*beam) < beam_size ? *beam : beam_size;
if (*firstStep) {
*firstStep = false;
GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
} else {
for (int k = 0; k < MaxLength; k++) {
if (k < MaxLength - *beam) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(-static_cast<T>(INFINITY), -1);
}
}
if (!(*is_empty)) {
GetTopK<T, BlockSize>(topk + MaxLength - *beam, val, col, tid, dim, *max,
length);
}
}
*max = topk[MaxLength - 1];
if ((*max).v == -1) *is_empty = true;
*beam = 0;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
Pair<T> topk[], T** topVal,
int64_t** topIds, int* beam, int* k,
const int tid, const int warp) {
while (true) {
__syncthreads();
if (tid < BlockSize / 2) {
if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) {
maxid[tid] = tid + BlockSize / 2;
} else {
maxid[tid] = tid;
}
}
__syncthreads();
for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) {
if (tid < stride) {
if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) {
maxid[tid] = maxid[tid + stride];
}
}
__syncthreads();
}
__syncthreads();
if (tid == 0) {
**topVal = sh_topk[maxid[0]].v;
**topIds = sh_topk[maxid[0]].id;
(*topVal)++;
(*topIds)++;
}
if (tid == maxid[0]) (*beam)++;
if (--(*k) == 0) break;
__syncthreads();
if (tid == maxid[0]) {
if (*beam < MaxLength) {
sh_topk[tid] = topk[*beam];
}
}
// NOTE(zcd): temporary solution
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
if (maxid[0] / 32 == warp) {
if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) ==
MaxLength)
break;
}
}
}
/**
* Each block compute one sample.
* In a block:
* 1. every thread get top MaxLength value;
* 2. merge to sh_topk, block reduce and get max value;
* 3. go to the second step, until one thread's topk value is null;
* 4. go to the first step, until the topk value is obtained.
*/
template <typename T, int MaxLength, int BlockSize>
__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
const T* src, int lds, int dim, int k,
int grid_dim, int num) {
__shared__ Pair<T> sh_topk[BlockSize];
const int tid = threadIdx.x;
const int warp = threadIdx.x / 32;
const int bid = blockIdx.x;
for (int i = bid; i < num; i += grid_dim) {
int top_num = k;
__shared__ int maxid[BlockSize / 2];
T* out = output + i * output_stride;
int64_t* inds = indices + i * k;
Pair<T> topk[MaxLength];
int beam = MaxLength;
Pair<T> max;
bool is_empty = false;
bool firststep = true;
for (int j = 0; j < MaxLength; j++) {
topk[j].set(-static_cast<T>(INFINITY), -1);
}
while (top_num) {
ThreadGetTopK<T, MaxLength, BlockSize>(
topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid);
sh_topk[tid] = topk[0];
BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &out, &inds,
&beam, &top_num, tid, warp);
}
}
}
template <typename T, int MaxLength, int BlockSize>
__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad,
size_t rows, size_t cols, size_t k) {
for (size_t i = 0; i < rows; ++i) {
for (size_t j = 0; j < cols; ++j) {
x_grad[i * cols + j] = 0;
}
for (size_t j = 0; j < k; ++j) {
size_t idx = indices[i * k + j];
x_grad[i * cols + idx] = out_grad[i * k + j];
}
}
}
inline static int GetDesiredBlockDim(int dim) {
if (dim > 128) {
return 256;
} else if (dim > 64) {
return 128;
} else if (dim > 32) {
return 64;
} else {
return 32;
}
}
// Iterator to move to the next row
struct SegmentOffsetIter {
EIGEN_DEVICE_FUNC
explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
return idx * num_cols_;
}
int num_cols_;
};
// Iterator mapping a flat element index into its column index
struct ColumnIndexIter {
explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(
const Eigen::array<int, 1>& ix) const {
return ix[0] % num_cols_;
}
int num_cols_;
};
__global__ void InitIndex(int64_t* indices, int64_t num_rows,
int64_t num_cols) {
int col_id = threadIdx.x;
int row_id = blockIdx.x;
for (int64_t j = row_id; j < num_rows; j += gridDim.x) {
for (int64_t i = col_id; i < num_cols; i += blockDim.x) {
indices[j * num_cols + i] = i;
}
}
}
template <typename T>
bool SortTopk(const platform::CUDADeviceContext& ctx,
const framework::Tensor* input_tensor, const int64_t num_cols,
const int64_t num_rows, const int k,
framework::Tensor* out_tensor,
framework::Tensor* indices_tensor) {
auto cu_stream = ctx.stream();
Tensor input_indices;
const std::vector<int64_t> dims = {num_rows, num_cols};
auto dim = framework::make_ddim(dims);
input_indices.Resize(dim);
// input_indices.Resize(num_rows*num_cols);
input_indices.mutable_data<int64_t>(ctx.GetPlace());
size_t temp_storage_bytes = -1;
auto ComputeBlockSize = [](int col) {
if (col > 512)
return 1024;
else if (col > 256 && col <= 512)
return 512;
else if (col > 128 && col <= 256)
return 256;
else if (col > 64 && col <= 128)
return 128;
else
return 64;
};
int block_size = ComputeBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
// in practice, num_rows < max_grid_size
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
: maxGridDimX;
// Init an index array
InitIndex<<<grid_size, block_size, 0, cu_stream>>>(
input_indices.data<int64_t>(), num_rows, num_cols);
// create iter for counting input
cub::CountingInputIterator<int64_t> counting_iter(0);
// segment_offset is used to move to the next row
cub::TransformInputIterator<int64_t, SegmentOffsetIter,
cub::CountingInputIterator<int64_t>>
segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
T* sorted_values_ptr;
int64_t* sorted_indices_ptr;
Tensor temp_values;
Tensor temp_indices;
const T* input = input_tensor->data<T>();
T* values = out_tensor->data<T>();
int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
if (k == num_cols) {
// Doing a full sort.
sorted_values_ptr = values;
sorted_indices_ptr = indices;
} else {
temp_values.Resize(dim);
temp_indices.Resize(dim);
sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
}
// Get temp storage buffer size; a fixed buffer could be allocated to save time.
auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
nullptr, temp_storage_bytes, input, sorted_values_ptr,
input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
cu_stream);
if (err != cudaSuccess) {
LOG(ERROR)
<< "TopKOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate "
"temp_storage_bytes, status: "
<< cudaGetErrorString(err);
return false;
}
Tensor temp_storage;
temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
temp_storage.data<uint8_t>(), temp_storage_bytes, input,
sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
0, sizeof(T) * 8, cu_stream);
if (err != cudaSuccess) {
LOG(ERROR)
<< "TopKOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
"temp_storage_bytes: "
<< temp_storage_bytes << ", status: " << cudaGetErrorString(err);
return false;
}
auto& dev = *ctx.eigen_device();
if (k < num_cols) {
// copy sliced data to output.
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, 0};
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
auto e_tmp_indices = EigenMatrix<int64_t>::From(temp_indices);
std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
auto dim = framework::make_ddim(odims);
auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
auto e_tmp_values = EigenMatrix<T>::From(temp_values);
e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
}
return true;
}
#define FIXED_BLOCK_DIM_BASE(dim, ...) \
case (dim): { \
constexpr auto kBlockDim = (dim); \
......@@ -523,7 +70,6 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
const int64_t input_width = inputdims[inputdims.size() - 1];
const auto& dev_ctx = ctx.cuda_device_context();
if ((input_width <= 1024 || k >= 128 || k == input_width)) {
if (SortTopk<T>(dev_ctx, input, input_width, input_height, k, output,
indices)) {
......@@ -576,7 +122,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel<T> {
framework::product(framework::slice_ddim(xdims, 0, xdims.size() - 1));
const size_t col = xdims[xdims.size() - 1];
const auto& dev_ctx = context.cuda_device_context();
const int kMaxHeight = 2048;
int gridx = row < kMaxHeight ? row : kMaxHeight;
switch (GetDesiredBlockDim(col)) {
......@@ -595,7 +140,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
top_k,
paddle::operators::TopkOpCUDAKernel<paddle::platform::CUDADeviceContext,
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/top_k_v2_op.h"
#include <memory>
namespace paddle {
namespace operators {
class TopkV2Op : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of TopkOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of TopkOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Indices"),
"Output(Indices) of TopkOp should not be null.");
auto input_dims = ctx->GetInputDim("X");
const int& dim_size = input_dims.size();
const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true,
"the axis of topk"
"must be [-%d, %d), but you set axis is %d",
dim_size, dim_size, axis);
if (axis < 0) axis += dim_size;
PADDLE_ENFORCE_GE(
k, 1, "the attribute of k in the topk must >= 1, but received %d .", k);
PADDLE_ENFORCE_GE(input_dims.size(), 1,
"input of topk must have >= 1d shape");
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_GE(
input_dims[axis], k,
"input of topk op must have >= %d columns in axis of %d", k, axis);
}
framework::DDim dims = input_dims;
dims[axis] = k;
ctx->SetOutputDim("Out", dims);
ctx->SetOutputDim("Indices", dims);
ctx->ShareLoD("X", "Out");
ctx->ShareLoD("X", "Indices");
}
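// Shape example (illustrative): for X with dims [2, 3, 4], axis = 1 and
// k = 2, both Out and Indices are inferred as [2, 2, 4].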
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context(),
layout_, library_);
}
};
class TopkV2OpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) The input of Topk op");
AddInput("K",
"(Tensor) Number of top elements to look for along "
"the last dimension (along each row for matrices).")
.AsDispensable();
AddOutput("Out", "(Tensor) The output tensor of Topk op");
AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
AddComment(R"DOC(
Top K operator
If the input is a vector (1d tensor), this operator finds the k largest
entries in the vector and outputs their values and indices as vectors.
Thus values[j] is the j-th largest entry in input, and its index is indices[j].
For matrices, this operator computes the top k entries in each row. )DOC");
AddAttr<int>("k",
"(int, default 1) Number of top elements to look for along "
"the tensor).")
.SetDefault(1);
AddAttr<int>("axis",
"the axis to sort and get the k indices, value."
"if not set, will get k value in last axis.")
.SetDefault(-1);
AddAttr<bool>("largest",
"control flag whether to return largest or smallest")
.SetDefault(true);
AddAttr<bool>("sorted",
"control flag whether to return elements in sorted order")
.SetDefault(true);
}
};
class TopkV2OpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(
ctx->HasInput("X"), true,
platform::errors::InvalidArgument("Input(X) should be not null"));
PADDLE_ENFORCE_EQ(
ctx->HasInput("Indices"), true,
platform::errors::InvalidArgument("Input(Indices) should be not null"));
PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
platform::errors::InvalidArgument(
"Grad Input(Out) should be not null"));
PADDLE_ENFORCE_EQ(
ctx->HasOutput(framework::GradVarName("X")), true,
platform::errors::InvalidArgument("Grad Output(X) should be not null"));
auto x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out"));
return framework::OpKernelType(data_type, ctx.device_context());
}
};
template <typename T>
class TopkV2GradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("top_k_v2_grad");
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetInput("X", this->Input("X"));
op->SetInput("Indices", this->Output("Indices"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetAttrMap(this->Attrs());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker,
ops::TopkV2GradOpMaker<paddle::framework::OpDesc>,
ops::TopkV2GradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad);
REGISTER_OP_CPU_KERNEL(top_k_v2,
ops::TopkV2Kernel<paddle::platform::CPUPlace, float>,
ops::TopkV2Kernel<paddle::platform::CPUPlace, double>,
ops::TopkV2Kernel<paddle::platform::CPUPlace, int32_t>,
ops::TopkV2Kernel<paddle::platform::CPUPlace, int64_t>)
REGISTER_OP_CPU_KERNEL(
top_k_v2_grad, ops::TopkV2GradKernel<paddle::platform::CPUPlace, float>,
ops::TopkV2GradKernel<paddle::platform::CPUPlace, double>,
ops::TopkV2GradKernel<paddle::platform::CPUPlace, int32_t>,
ops::TopkV2GradKernel<paddle::platform::CPUPlace, int64_t>)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/p_norm_op.h"
#include "paddle/fluid/operators/top_k_function_cuda.h"
#include "paddle/fluid/operators/top_k_v2_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
#define FIXED_BLOCK_DIM_BASE(dim, ...) \
case (dim): { \
constexpr auto kBlockDim = (dim); \
__VA_ARGS__; \
} break
#define FIXED_BLOCK_DIM(...) \
FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
template <typename DeviceContext, typename T>
class TopkV2OpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
auto* indices = ctx.Output<Tensor>("Indices");
// get the attributes
int k = static_cast<int>(ctx.Attr<int>("k"));
int axis = static_cast<int>(ctx.Attr<int>("axis"));
const bool& sorted = static_cast<bool>(ctx.Attr<bool>("sorted"));
const bool& largest = static_cast<bool>(ctx.Attr<bool>("largest"));
// get the input dims
const auto& in_dims = input->dims();
// calculate the real axis
if (axis < 0) axis += in_dims.size();
auto* k_t = ctx.Input<Tensor>("K");
if (k_t) {
Tensor k_host;
framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host);
k = k_host.data<int>()[0];
framework::DDim output_dims = output->dims();
output_dims[axis] = k;
output->Resize(output_dims);
indices->Resize(output_dims);
}
const auto& out_dims = output->dims();
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
if (axis == in_dims.size() - 1) {
// if getting the topK from the last axis
const int64_t& input_height = framework::product(
framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
const auto& dev_ctx = ctx.cuda_device_context();
if (k > input_width) k = input_width;
if ((input_width <= 1024 || k >= 128 || k == input_width)) {
if (SortTopk<T>(dev_ctx, input, input_width, input_height, k, output,
indices, largest)) {
// Succeeded, return.
return;
} else {
LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
"default topk kernel.";
}
}
// NOTE: pass lds and dim the same as the input width.
// NOTE: the old matrix implementation's stride differs from eigen's.
const int kMaxHeight = 2048;
int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
switch (GetDesiredBlockDim(input_width)) {
FIXED_BLOCK_DIM(
KeMatrixTopK<T, 5,
kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
output_data, k, indices_data, input_data, input_width,
input_width, static_cast<int>(k), gridx, input_height,
largest));
default:
PADDLE_THROW(platform::errors::Fatal(
"the input data shape has error in the topk cuda kernel."));
}
} else {
// if getting topK not from the last axis, transpose the tensor and then
// get the topK
// first step, prepare the trans args for the transpose
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
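// e.g. for a 4-D input with axis = 1 this builds trans = {0, 3, 2, 1},
// swapping the topk axis with the last axis so the last-axis path applies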
framework::DDim trans_dims(in_dims);
framework::DDim trans_out_dims(output->dims());
for (int i = 0; i < trans.size(); i++) {
trans_dims[i] = in_dims[trans[i]];
trans_out_dims[i] = out_dims[trans[i]];
}
// second step, transpose the input
Tensor trans_input;
trans_input.mutable_data<T>(trans_dims, ctx.GetPlace());
int ndims = trans.size();
const auto& dev_ctx = ctx.cuda_device_context();
TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
&trans_input, trans);
// third step, calculate the topk
// allocate the tmp cuda memory for the tmp result
Tensor trans_ind;
trans_ind.mutable_data<int64_t>(trans_out_dims, ctx.GetPlace());
Tensor trans_out;
trans_out.mutable_data<T>(trans_out_dims, ctx.GetPlace());
const int64_t input_height = framework::product(
framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
const int64_t input_width = trans_dims[trans_dims.size() - 1];
if (k > input_width) k = input_width;
if ((input_width <= 1024 || k >= 128 || k == input_width)) {
if (SortTopk<T>(dev_ctx, &trans_input, input_width, input_height, k,
&trans_out, &trans_ind, largest)) {
// last step, transpose back the indices and output
TransCompute<platform::CUDADeviceContext, int64_t>(
ndims, dev_ctx, trans_ind, indices, trans);
TransCompute<platform::CUDADeviceContext, T>(
ndims, dev_ctx, trans_out, output, trans);
return;
} else {
LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
"default topk kernel.";
}
}
const int kMaxHeight = 2048;
int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
switch (GetDesiredBlockDim(input_width)) {
FIXED_BLOCK_DIM(
KeMatrixTopK<T, 5,
kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
trans_out.data<T>(), k, trans_ind.data<int64_t>(),
trans_input.data<T>(), input_width, input_width,
static_cast<int>(k), gridx, input_height, largest));
default:
PADDLE_THROW(platform::errors::Fatal(
"the input data shape has error in the topk cuda kernel."));
}
// last step, transpose back the indices and output
TransCompute<platform::CUDADeviceContext, int64_t>(
ndims, dev_ctx, trans_ind, indices, trans);
TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, trans_out,
output, trans);
}
}
};
#undef FIXED_BLOCK_DIM_BASE
#undef FIXED_BLOCK_DIM
template <typename DeviceContext, typename T>
class TopkV2OpGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(context.GetPlace()), true,
platform::errors::InvalidArgument("It must use CUDAPlace."));
auto* x = context.Input<Tensor>("X");
auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
auto* indices = context.Input<Tensor>("Indices");
auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
int axis = context.Attr<int>("axis");
const auto& in_dims = x->dims();
const auto& out_dims = indices->dims();
// get the real axis and the k
if (axis < 0) axis += in_dims.size();
const int& k = out_dims[axis];
const int& raw_height = in_dims[axis];
// allocate the cuda memory for the x_grad
T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
const T* out_grad_data = out_grad->data<T>();
const int64_t* indices_data = indices->data<int64_t>();
int pre, n, post;
GetDims(in_dims, axis, &pre, &n, &post);
// calculate the block and grid num
auto& dev_ctx = context.cuda_device_context();
auto ComputeBlockSize = [](int col) {
if (col > 512)
return 1024;
else if (col > 256 && col <= 512)
return 512;
else if (col > 128 && col <= 256)
return 256;
else if (col > 64 && col <= 128)
return 128;
else
return 64;
};
int block_size = ComputeBlockSize(post * k);
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
int grid_size = std::min(max_blocks, pre);
// launch the cuda kernel to assign the grad
AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
out_grad_data, indices_data, x_grad_data, pre, post, n, k);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
top_k_v2,
paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
float>,
paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
double>,
paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
int>,
paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
int64_t>,
paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>);
REGISTER_OP_CUDA_KERNEL(
top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel<
paddle::platform::CUDADeviceContext, float>,
paddle::operators::TopkV2OpGradCUDAKernel<
paddle::platform::CUDADeviceContext, double>,
paddle::operators::TopkV2OpGradCUDAKernel<
paddle::platform::CUDADeviceContext, int>,
paddle::operators::TopkV2OpGradCUDAKernel<
paddle::platform::CUDADeviceContext, int64_t>,
paddle::operators::TopkV2OpGradCUDAKernel<
paddle::platform::CUDADeviceContext, paddle::platform::float16>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
The reason we need topk v2 is compatibility: it redefines NaN as the maximum
value during comparison. Without topk v2, this would change the inference
results of models trained with older versions of PaddlePaddle.
*/
#pragma once
#include <algorithm>
#include <iostream>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/operators/transpose_op.h"
namespace paddle {
namespace operators {
template <typename T, typename Type>
static void FullTopK(Type input_height, Type input_width, int input_dim,
const framework::Tensor* input, T* t_out, Type* t_indices,
const int& k, const bool& largest, const bool& sorted) {
// when k is small, use partial sort
bool partial_sort_flag = (k * 64) < input_width;
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
// Eigen::DSizes<int, 2> flat2dims(input_height, input_width);
for (Type i = 0; i < input_height; ++i) {
std::vector<std::pair<T, Type>> col_vec;
col_vec.reserve(input_width);
if (input_dim == 1) {
auto e_input = EigenVector<T>::Flatten(*input);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
}
} else {
auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
}
}
if (partial_sort_flag) {
std::partial_sort(
col_vec.begin(), col_vec.begin() + k, col_vec.end(),
[&largest](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
if (largest) {
return (std::isnan(static_cast<double>(l.first)) &&
!std::isnan(static_cast<double>(r.first))) ||
(l.first > r.first);
} else {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
}
});
} else {
// use nth_element to get the k largest or k smallest elements
if (largest) {
std::nth_element(
col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (std::isnan(static_cast<double>(l.first)) &&
!std::isnan(static_cast<double>(r.first))) ||
(l.first > r.first);
});
// nth_element leaves the first k elements unordered; sort them if required
if (sorted) {
std::sort(col_vec.begin(), col_vec.begin() + k - 1,
[&largest](const std::pair<T, Type>& l,
const std::pair<T, Type>& r) {
return (std::isnan(static_cast<double>(l.first)) &&
!std::isnan(static_cast<double>(r.first))) ||
(l.first > r.first);
});
}
} else {
std::nth_element(
col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
});
// nth_element leaves the first k elements unordered; sort them if required
if (sorted) {
std::sort(
col_vec.begin(), col_vec.begin() + k - 1,
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
});
}
}
}
for (Type j = 0; j < k; ++j) {
t_out[i * k + j] = col_vec[j].first;
t_indices[i * k + j] = col_vec[j].second;
}
}
}
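For reference, a hedged Python rendering of FullTopK's per-row behavior (a sketch, not the committed kernel): pair each value with its column index, order with the NaN-as-max rule, and keep the first k pairs. A full sort stands in for the partial_sort / nth_element fast paths.

```python
import math
from functools import cmp_to_key

def full_topk_row(row, k, largest=True):
    def cmp(l, r):
        if largest:
            gt = (math.isnan(l[0]) and not math.isnan(r[0])) or (l[0] > r[0])
        else:
            gt = (not math.isnan(l[0]) and math.isnan(r[0])) or (l[0] < r[0])
        return -1 if gt else 1

    # Sort (value, index) pairs, then keep the first k of each.
    pairs = sorted(((v, j) for j, v in enumerate(row)), key=cmp_to_key(cmp))
    return [p[0] for p in pairs[:k]], [p[1] for p in pairs[:k]]

print(full_topk_row([2.0, 9.0, 5.0, 7.0], k=2))  # ([9.0, 7.0], [1, 3])
```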
template <typename T, typename Type>
static void FullTopKAssign(const Type& input_height, const Type& input_width,
const int& input_dim, const framework::Tensor* input,
const framework::Tensor* indices, T* output_data,
const int& k) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (Type i = 0; i < input_height; ++i) {
if (input_dim == 1) {
auto e_input = EigenVector<T>::Flatten(*input);
auto e_indices = EigenVector<Type>::Flatten(*indices);
for (Type j = 0; j < k; ++j) {
output_data[i * input_width + e_indices(j)] = e_input(j);
}
} else {
auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
for (Type j = 0; j < k; ++j) {
output_data[i * input_width + e_indices(i, j)] = e_input(i, j);
}
}
}
}
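The gradient scatter above, restated as a minimal Python sketch (assuming a single 1-D row for brevity): each of the k output gradients is written to the column its stored top-k index points at, and untouched columns keep their zero initialization.

```python
def topk_grad_row(out_grad, indices, width):
    x_grad = [0.0] * width   # zero init: unselected elements get no grad
    for g, idx in zip(out_grad, indices):
        x_grad[idx] = g      # scatter along the stored indices
    return x_grad

print(topk_grad_row([0.1, 0.2], [2, 0], width=4))  # [0.2, 0.0, 0.1, 0.0]
```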
template <typename DeviceContext, typename T>
class TopkV2Kernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
// Get the top k elements of each row of input tensor
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
auto* indices = context.Output<Tensor>("Indices");
const auto& in_dims = input->dims();
int k = static_cast<int>(context.Attr<int>("k"));
const auto& sorted = static_cast<bool>(context.Attr<bool>("sorted"));
const auto& largest = static_cast<bool>(context.Attr<bool>("largest"));
// if axis < 0, calculate the real axis
int axis = static_cast<int>(context.Attr<int>("axis"));
if (axis < 0) axis += in_dims.size();
// if the K tensor is not null, use it as k
auto* k_t = context.Input<Tensor>("K");
if (k_t) {
k = k_t->data<int>()[0];
framework::DDim output_dims = output->dims();
// set the k value for the given axis in the output dims
output_dims[axis] = k;
output->Resize(output_dims);
indices->Resize(output_dims);
}
T* output_data = output->mutable_data<T>(context.GetPlace());
int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
const auto& out_dims = output->dims();
if (axis + 1 == in_dims.size()) {
const int64_t& input_height = framework::product(
framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
FullTopK<T, int64_t>(input_height, input_width, in_dims.size(), input,
output_data, indices_data, k, largest, sorted);
} else {
// if the top-k axis is not the last dim, transpose first, then do top-k
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.push_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
// get the trans input_dims, out_dims
framework::DDim trans_dims(in_dims);
framework::DDim trans_out_dims(output->dims());
for (size_t i = 0; i < trans.size(); i++) {
trans_dims[i] = in_dims[trans[i]];
}
for (size_t i = 0; i < trans.size(); i++) {
trans_out_dims[i] = out_dims[trans[i]];
}
Tensor trans_inp;
trans_inp.mutable_data<T>(trans_dims, context.GetPlace());
int ndims = trans.size();
auto& dev_context =
context.template device_context<platform::CPUDeviceContext>();
// transpose the input value
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *input,
&trans_inp, trans);
const int64_t input_height = framework::product(
framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
const int64_t input_width = trans_dims[trans_dims.size() - 1];
// allocate temp tensors to save the top-k values and indices
Tensor tmp_out;
T* t_out = tmp_out.mutable_data<T>(trans_out_dims, context.GetPlace());
Tensor tmp_indices;
auto* t_ind =
tmp_indices.mutable_data<int64_t>(trans_out_dims, context.GetPlace());
// get the TopK value
FullTopK<T, int64_t>(input_height, input_width, in_dims.size(),
&trans_inp, t_out, t_ind, k, largest, sorted);
// transpose back
TransCompute<platform::CPUDeviceContext, int64_t>(
ndims, dev_context, tmp_indices, indices, trans);
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
output, trans);
}
}
};
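The non-last-axis path can be summarized with numpy (an illustrative sketch, assuming largest=True and sorted=True, and ignoring the NaN-as-max rule for brevity): move the reduced axis to the end, take top-k on the last dim, then move values and indices back.

```python
import numpy as np

def topk_along_axis(x, k, axis):
    x_t = np.moveaxis(x, axis, -1)             # transpose axis to the end
    idx = np.argsort(-x_t, axis=-1)[..., :k]   # top-k on the last dim
    val = np.take_along_axis(x_t, idx, axis=-1)
    return np.moveaxis(val, -1, axis), np.moveaxis(idx, -1, axis)

x = np.arange(24).reshape(2, 3, 4)
vals, inds = topk_along_axis(x, k=2, axis=1)
print(vals.shape, inds.shape)  # (2, 2, 4) (2, 2, 4)
```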
template <typename DeviceContext, typename T>
class TopkV2GradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
auto* indices = context.Input<Tensor>("Indices");
auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
int axis = static_cast<int>(context.Attr<int>("axis"));
const auto& in_dims = x->dims();
const auto& out_dims = indices->dims();
// if axis < 0, get the real axis
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
const size_t& k = out_dims[axis];
T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
if (axis + 1 == in_dims.size()) {
// allocate the memory for the input_grad
// assign the out_grad to input_grad directly
const int64_t input_height = framework::product(
framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t input_width = in_dims[in_dims.size() - 1];
// init the output grad with 0, because some input elements have no grad
memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
// Assign the output_grad to input_grad
FullTopKAssign(input_height, input_width, in_dims.size(), out_grad,
indices, x_grad_data, k);
} else {
// cannot assign the grad to input_grad directly; transpose first
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(out_dims.size() - 1);
for (int i = axis + 1; i < out_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
framework::DDim trans_dims(out_dims);
framework::DDim trans_in_dims(in_dims);
for (size_t i = 0; i < trans.size(); i++) {
trans_dims[i] = out_dims[trans[i]];
trans_in_dims[i] = in_dims[trans[i]];
}
// transpose the out_grad, indices
Tensor trans_dO;
trans_dO.mutable_data<T>(trans_dims, context.GetPlace());
Tensor trans_ind;
trans_ind.mutable_data<int64_t>(trans_dims, context.GetPlace());
int ndims = trans.size();
auto& dev_context =
context.template device_context<platform::CPUDeviceContext>();
// Do transpose
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *out_grad,
&trans_dO, trans);
TransCompute<platform::CPUDeviceContext, int64_t>(
ndims, dev_context, *indices, &trans_ind, trans);
const int64_t input_height = framework::product(
framework::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
// assign the out_grad to the transposed input_grad
Tensor tmp_out;
T* t_out = tmp_out.mutable_data<T>(trans_in_dims, context.GetPlace());
memset(t_out, 0, x_grad->numel() * sizeof(T));
FullTopKAssign<T, int64_t>(input_height, input_width, in_dims.size(),
&trans_dO, &trans_ind, t_out, k);
// Transpose back
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
x_grad, trans);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -24,17 +24,63 @@ class UniqueOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "unique");
OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
auto in_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(
in_dims.size(), 1,
platform::errors::InvalidArgument("The Input(X) should be 1-D Tensor, "
"But now the dims of Input(X) is %d.",
in_dims.size()));
if (!ctx->Attrs().Get<bool>("is_sorted")) {
OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
PADDLE_ENFORCE_EQ(in_dims.size(), 1,
platform::errors::InvalidArgument(
"The Input(X) should be 1-D Tensor, "
"But now the dims of Input(X) is %d.",
in_dims.size()));
ctx->SetOutputDim("Out", {-1});
ctx->SetOutputDim("Index", in_dims);
return;
}
bool return_index = ctx->Attrs().Get<bool>("return_index");
bool return_inverse = ctx->Attrs().Get<bool>("return_inverse");
bool return_counts = ctx->Attrs().Get<bool>("return_counts");
auto axis_vec = ctx->Attrs().Get<std::vector<int>>("axis");
if (return_index) {
OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "unique");
}
if (return_inverse) {
OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
}
if (return_counts) {
OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", "unique");
}
ctx->SetOutputDim("Out", {-1});
ctx->SetOutputDim("Index", in_dims);
if (axis_vec.empty()) {
ctx->SetOutputDim("Out", {-1});
if (return_inverse) {
ctx->SetOutputDim("Index", {framework::product(in_dims)});
}
} else {
int axis = axis_vec[0];
if (axis < 0) {
axis += in_dims.size();
}
PADDLE_ENFORCE_LT(
axis, in_dims.size(),
platform::errors::InvalidArgument("The axis(%d) should be less than "
"the dimension size(%d) of x.",
axis, in_dims.size()));
auto out_dims = in_dims;
out_dims[axis] = -1;
ctx->SetOutputDim("Out", out_dims);
if (return_inverse) {
ctx->SetOutputDim("Index", {in_dims[axis]});
}
}
if (return_index) {
ctx->SetOutputDim("Indices", {-1});
}
if (return_counts) {
ctx->SetOutputDim("Counts", {-1});
}
}
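The shapes set above mirror numpy.unique: only the reduced axis becomes dynamic (-1), and the inverse Index has length in_dims[axis]. A quick check:

```python
import numpy as np

x = np.array([[1, 1, 2],
              [3, 3, 4]])
out, inverse = np.unique(x, axis=1, return_inverse=True)
print(out.shape, inverse.shape)  # (2, 2) (3,)
```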
protected:
......@@ -49,14 +95,47 @@ class UniqueOp : public framework::OperatorWithKernel {
class UniqueOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input tensor. It should be a 1-D tensor.");
AddInput("X",
"Input tensor. It should be a 1-D tensor when Attr(is_sorted)"
" is fasle or a N-D tensor when Attr(is_sorted) is true.");
AddAttr<int>("dtype", "data type for output index");
AddOutput("Out", "A unique subsequence for input tensor.");
AddOutput("Index",
"An index tensor pointing to unique subsequence, which has "
"identical shape with input tensor and int64 dtype.");
"Equivalent to inverse in numpy.unique, "
"the indices for where elements in the original input ended up "
"in the returned unique tensor.");
AddOutput(
"Indices",
"The indices of the input tensor that result in the unique tensor.")
.AsDispensable();
AddOutput("Counts", "The counts for each unique element.").AsDispensable();
AddAttr<bool>("return_index",
"If True, also return the indices of the input"
" tensor that result in the unique Tensor.")
.SetDefault(false);
AddAttr<bool>(
"return_inverse",
"If True, also return the indices for where elements"
" in the original input ended up in the returned unique tensor.")
.SetDefault(false);
AddAttr<bool>("return_counts",
"If True, also return the counts for each unique element.")
.SetDefault(false);
AddAttr<std::vector<int>>(
"axis",
"The axis to apply unique. If None, the input will be flattened.")
.SetDefault({});
AddAttr<bool>("is_sorted",
"If True, the unique elements of X are in ascending order."
"Otherwise, the unique elements are not sorted.")
.SetDefault(false);
AddComment(R"DOC(
Return a unique subsequence for 1-D input tensor, and an index tensor pointing to this unique subsequence
1. Returns a unique subsequence of the 1-D input tensor, and an index tensor
pointing into this unique subsequence, when Attr(is_sorted) is false. This
is the path taken by fluid.layers.unique.
2. Returns the unique elements of X in ascending order when Attr(is_sorted)
is true. This is the path taken by paddle.unique.
)DOC");
}
};
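A minimal usage sketch of the sorted mode described in point 2, assuming the 2.0-style dygraph API exercised by the unit tests added later in this commit:

```python
import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.array([2, 3, 3, 1, 5, 3], dtype='int64'))
out, counts = paddle.unique(x, return_counts=True)  # is_sorted=True path
print(out.numpy())     # [1 2 3 5]
print(counts.numpy())  # [1 1 3 1]
```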
......@@ -65,6 +144,8 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker {
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker);
REGISTER_OP_CPU_KERNEL(unique, ops::UniqueKernel<float>,
ops::UniqueKernel<double>, ops::UniqueKernel<int32_t>,
ops::UniqueKernel<int64_t>);
REGISTER_OP_CPU_KERNEL(
unique, ops::UniqueKernel<paddle::platform::CPUDeviceContext, float>,
ops::UniqueKernel<paddle::platform::CPUDeviceContext, double>,
ops::UniqueKernel<paddle::platform::CPUDeviceContext, int32_t>,
ops::UniqueKernel<paddle::platform::CPUDeviceContext, int64_t>);
......@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <cmath>
#include <numeric>
#include <set>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/transpose_op.h"
namespace paddle {
namespace operators {
......@@ -104,17 +109,243 @@ struct UniqueOpFunctor {
}
};
static std::vector<framework::Tensor> Unbind(const framework::Tensor& in) {
int64_t size = in.dims()[0];
std::vector<framework::Tensor> tensors(size);
for (int64_t i = 0; i < size; ++i) {
tensors[i] = in.Slice(i, i + 1);
}
return tensors;
}
template <typename T>
static bool Equal(const framework::Tensor& a, const framework::Tensor& b) {
if (a.numel() != b.numel()) {
return false;
}
for (int64_t i = 0; i < a.numel(); ++i) {
if (a.data<T>()[i] != b.data<T>()[i]) {
return false;
}
}
return true;
}
template <typename T>
static void UniqueFlattendTensor(const framework::ExecutionContext& context,
const framework::Tensor& in,
framework::Tensor* out, bool return_index,
bool return_inverse, bool return_counts) {
const T* in_data = in.data<T>();
std::set<T> unique(in_data, in_data + in.numel());
out->Resize(framework::make_ddim({static_cast<int64_t>(unique.size())}));
auto out_data = out->mutable_data<T>(context.GetPlace());
std::copy(unique.begin(), unique.end(), out_data);
if (return_index) {
auto* indices = context.Output<framework::Tensor>("Indices");
indices->Resize(framework::make_ddim({out->numel()}));
auto indices_data = indices->mutable_data<int64_t>(context.GetPlace());
std::unordered_map<T, int64_t> indices_map;
indices_map.reserve(out->numel());
for (int64_t i = 0; i < in.numel(); ++i) {
if (indices_map.find(in_data[i]) != indices_map.end()) continue;
indices_map[in_data[i]] = i;
}
for (int64_t i = 0; i < out->numel(); ++i) {
indices_data[i] = indices_map[out_data[i]];
}
}
if (return_inverse) {
auto* inverse = context.Output<framework::Tensor>("Index");
inverse->Resize(framework::make_ddim({in.numel()}));
auto inverse_data = inverse->mutable_data<int64_t>(context.GetPlace());
std::unordered_map<T, int64_t> inverse_map;
inverse_map.reserve(out->numel());
for (int64_t i = 0; i < out->numel(); ++i) {
inverse_map[out_data[i]] = i;
}
for (int64_t i = 0; i < in.numel(); ++i) {
inverse_data[i] = inverse_map[in_data[i]];
}
}
if (return_counts) {
auto* count = context.Output<framework::Tensor>("Counts");
count->Resize(framework::make_ddim({out->numel()}));
auto count_data = count->mutable_data<int64_t>(context.GetPlace());
std::unordered_map<T, int64_t> counts_map;
counts_map.reserve(out->numel());
for (int64_t i = 0; i < out->numel(); ++i) {
counts_map[out_data[i]] = 0;
}
for (int64_t i = 0; i < in.numel(); i++) {
counts_map[in_data[i]] += 1;
}
for (int64_t i = 0; i < out->numel(); i++) {
count_data[i] = counts_map[out_data[i]];
}
}
}
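A pure-Python restatement of the flattened path above (a sketch, not the committed code): a sorted set gives Out, first-occurrence positions give Indices, a value-to-slot map gives Index (the inverse), and a counter gives Counts.

```python
def unique_flattened(values):
    out = sorted(set(values))                 # Out: sorted unique values
    slot = {v: i for i, v in enumerate(out)}  # value -> position in Out
    first = {}
    for i, v in enumerate(values):
        first.setdefault(v, i)                # first occurrence in the input
    indices = [first[v] for v in out]         # Indices
    inverse = [slot[v] for v in values]       # Index (inverse mapping)
    counts = [0] * len(out)
    for v in values:
        counts[slot[v]] += 1                  # Counts
    return out, indices, inverse, counts

print(unique_flattened([2, 3, 3, 1, 5, 3]))
# ([1, 2, 3, 5], [3, 0, 1, 4], [1, 2, 2, 0, 3, 2], [1, 1, 3, 1])
```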
template <class ForwardIt, typename T>
static ForwardIt UniqueDimImpl(const framework::ExecutionContext& context,
ForwardIt first, ForwardIt last,
const std::vector<int64_t>& sorted_indices_vec,
std::vector<int64_t>* inverse_vec,
std::vector<int64_t>* counts_vec,
std::vector<int64_t>* indices_vec) {
if (first == last) {
return last;
}
(*inverse_vec)[sorted_indices_vec[0]] = 0;
(*counts_vec)[0] = 1;
(*indices_vec)[0] = sorted_indices_vec[0];
ForwardIt begin = first;
ForwardIt result = first;
while (++first != last) {
int64_t idx_first = std::distance(begin, first);
int64_t idx_result = std::distance(begin, result);
if (!Equal<T>(*result, *first)) {
if (++result != first) {
*result = std::move(*first);
}
idx_result += 1;
(*indices_vec)[idx_result] = sorted_indices_vec[idx_first];
}
(*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result;
(*counts_vec)[idx_result] += 1;
}
return ++result;
}
template <typename DeviceContext, typename T>
static void UniqueDim(const framework::ExecutionContext& context,
const framework::Tensor& in, framework::Tensor* out,
bool return_index, bool return_inverse,
bool return_counts, int axis) {
// transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2]
std::vector<int> permute(in.dims().size());
std::iota(permute.begin(), permute.end(), 0);
permute[axis] = 0;
permute[0] = axis;
std::vector<int64_t> in_trans_dims_vec(framework::vectorize(in.dims()));
in_trans_dims_vec[axis] = in.dims()[0];
in_trans_dims_vec[0] = in.dims()[axis];
framework::Tensor in_trans;
framework::DDim in_trans_dims = framework::make_ddim(in_trans_dims_vec);
in_trans.Resize(in_trans_dims);
in_trans.mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
TransCompute<DeviceContext, T>(in.dims().size(), dev_ctx, in, &in_trans,
permute);
// reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2]
framework::DDim in_trans_flat_dims =
framework::flatten_to_2d(in_trans_dims, 1);
in_trans.Resize(in_trans_flat_dims);
// sort indices
std::vector<int64_t> sorted_indices_vec(in_trans.dims()[0]);
std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0);
int64_t col = in_trans.dims()[1];
const T* in_trans_data = in_trans.data<T>();
std::sort(sorted_indices_vec.begin(), sorted_indices_vec.end(),
[&](int64_t a, int64_t b) -> bool {
for (int64_t i = 0; i < col; ++i) {
T lhs = in_trans_data[i + a * col];
T rhs = in_trans_data[i + b * col];
if (lhs < rhs) {
return true;
} else if (lhs > rhs) {
return false;
}
}
return false;
});
// sort tensor according to indices
framework::Tensor input_sorted;
input_sorted.Resize(in_trans_dims);
input_sorted.mutable_data<T>(context.GetPlace());
T* input_sorted_data = input_sorted.data<T>();
for (size_t i = 0; i < sorted_indices_vec.size(); ++i) {
memcpy(input_sorted_data + i * col,
in_trans_data + sorted_indices_vec[i] * col, col * sizeof(T));
}
std::vector<framework::Tensor> input_unbind = Unbind(input_sorted);
std::vector<int64_t> inverse_vec(sorted_indices_vec.size(), 0);
std::vector<int64_t> counts_vec(sorted_indices_vec.size(), 0);
std::vector<int64_t> indices_vec(sorted_indices_vec.size(), 0);
auto last = UniqueDimImpl<std::vector<framework::Tensor>::iterator, T>(
context, input_unbind.begin(), input_unbind.end(), sorted_indices_vec,
&inverse_vec, &counts_vec, &indices_vec);
input_unbind.erase(last, input_unbind.end());
counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end());
indices_vec.erase(indices_vec.begin() + input_unbind.size(),
indices_vec.end());
math::ConcatFunctor<DeviceContext, T> concat_functor;
framework::Tensor out_trans;
std::vector<int64_t> out_trans_dims_vec = in_trans_dims_vec;
out_trans_dims_vec[0] = input_unbind.size();
out_trans.Resize(framework::make_ddim(out_trans_dims_vec));
out_trans.mutable_data<T>(context.GetPlace());
std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]);
out->Resize(framework::make_ddim(out_trans_dims_vec));
out->mutable_data<T>(context.GetPlace());
concat_functor(dev_ctx, input_unbind, 0, &out_trans);
TransCompute<DeviceContext, T>(out_trans.dims().size(), dev_ctx, out_trans,
out, permute);
if (return_inverse) {
auto* inverse = context.Output<framework::Tensor>("Index");
framework::TensorFromVector(inverse_vec, context.device_context(), inverse);
}
if (return_counts) {
auto* count = context.Output<framework::Tensor>("Counts");
framework::TensorFromVector(counts_vec, context.device_context(), count);
}
if (return_index) {
auto* indices = context.Output<framework::Tensor>("Indices");
framework::TensorFromVector(indices_vec, context.device_context(), indices);
}
}
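The same strategy in numpy terms (illustrative only): move `axis` to the front, flatten the remaining dims, sort the rows lexicographically, then collapse equal neighbours — exactly the transpose/reshape/sort/dedup sequence UniqueDim performs.

```python
import numpy as np

def unique_dim(x, axis):
    moved = np.moveaxis(x, axis, 0)
    flat = moved.reshape(moved.shape[0], -1)
    order = np.lexsort(flat.T[::-1])                # lexicographic row sort
    srt = flat[order]
    keep = np.ones(len(srt), dtype=bool)
    keep[1:] = np.any(srt[1:] != srt[:-1], axis=1)  # drop equal neighbours
    return np.moveaxis(moved[order[keep]], 0, axis)

x = np.array([[1, 2], [3, 4], [1, 2]])
print(unique_dim(x, axis=0))  # [[1 2] [3 4]]
```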
template <typename DeviceContext, typename T>
class UniqueKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto data_type = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("dtype"));
auto* x = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
auto* index = context.Output<framework::Tensor>("Index");
if (!context.Attr<bool>("is_sorted")) {
auto data_type = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("dtype"));
auto* index = context.Output<framework::Tensor>("Index");
framework::VisitDataType(data_type, UniqueOpFunctor<T>(out, index, x));
return;
}
framework::VisitDataType(data_type, UniqueOpFunctor<T>(out, index, x));
std::vector<int> axis_vec = context.Attr<std::vector<int>>("axis");
bool return_index = context.Attr<bool>("return_index");
bool return_inverse = context.Attr<bool>("return_inverse");
bool return_counts = context.Attr<bool>("return_counts");
if (axis_vec.empty()) {
UniqueFlattendTensor<T>(context, *x, out, return_index, return_inverse,
return_counts);
} else {
int axis = axis_vec[0];
UniqueDim<DeviceContext, T>(context, *x, out, return_index,
return_inverse, return_counts, axis);
}
}
};
......
......@@ -62,6 +62,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
{"sync_batch_norm",
{"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance",
"ReserveSpace"}},
{"unique", {"Out", "Index", "Indices", "Counts"}},
};
// NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are
......
......@@ -19,6 +19,7 @@ limitations under the License. */
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
......@@ -564,9 +565,9 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
if (!is_gpu_tensor && !is_xpu_tensor) {
if (!need_deep_copy) {
return py::array(py::buffer_info(
const_cast<void *>(tensor_buf_ptr), sizeof_dtype, py_dtype_str,
static_cast<size_t>(tensor.dims().size()), py_dims, py_strides));
auto base = py::cast(std::move(tensor));
return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides,
const_cast<void *>(tensor_buf_ptr), base);
} else {
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(
......
......@@ -29,6 +29,8 @@ function(train_test TARGET_NAME)
PROPERTIES DEPENDS test_${TARGET_NAME})
set_tests_properties(test_train_${TARGET_NAME}${arg}
PROPERTIES LABELS "RUN_TYPE=DIST")
set_tests_properties(test_train_${TARGET_NAME}${arg}
PROPERTIES TIMEOUT 150)
endforeach()
endfunction(train_test)
......
......@@ -70,7 +70,6 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
| `WITH_STYLE_CHECK` | ON | Check the code style when building. |
| `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu |
| `RUN_TEST` | OFF | Run unit test immediately after the build. |
| `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` |
## Docker Images
......@@ -155,21 +154,6 @@ docker push
kubectl ...
```
### Reading source code with woboq codebrowser
For developers who are interested in the C++ source code, you can build C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser).
- The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host:
```bash
./paddle/scripts/paddle_docker_build.sh html
```
- You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run:
```
docker run -v $HOME/woboq_out:/usr/share/nginx/html -d -p 8080:80 nginx
```
## More Options
......
......@@ -529,13 +529,16 @@ EOF
pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
fi
ut_startTime_s=`date +%s`
ctest --output-on-failure -j $2
ctest --output-on-failure -j $2;mactest_error=$?
ut_endTime_s=`date +%s`
echo "Mac testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s"
paddle version
# Restore the proxy to avoid failures in later steps
export http_proxy=$my_proxy
export https_proxy=$my_proxy
if [ "$mactest_error" != 0 ];then
exit 8;
fi
fi
}
......@@ -1105,22 +1108,6 @@ EOF
esac
}
function gen_html() {
cat <<EOF
========================================
Converting C++ source code into HTML ...
========================================
EOF
export WOBOQ_OUT=${PADDLE_ROOT}/build/woboq_out
mkdir -p $WOBOQ_OUT
cp -rv /woboq/data $WOBOQ_OUT/../data
/woboq/generator/codebrowser_generator \
-b ${PADDLE_ROOT}/build \
-a \
-o $WOBOQ_OUT \
-p paddle:${PADDLE_ROOT}
/woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
}
function gen_dockerfile() {
# Set BASE_IMAGE according to env variables
......@@ -1443,9 +1430,6 @@ function main() {
gen_doc_lib)
gen_doc_lib $2
;;
html)
gen_html
;;
dockerfile)
gen_dockerfile ${PYTHON_ABI:-""}
;;
......
......@@ -14,7 +14,7 @@
import paddle
from paddle.distributed.fleet.proto import distributed_strategy_pb2
from paddle.fluid.framework import Variable
from paddle.fluid.framework import Variable, set_flags, core
import google.protobuf.text_format
......@@ -810,6 +810,68 @@ class DistributedStrategy(object):
else:
print("WARNING: auto should have value of bool type")
@property
def cudnn_exhaustive_search(self):
return self.strategy.cudnn_exhaustive_search
@cudnn_exhaustive_search.setter
def cudnn_exhaustive_search(self, flag):
if isinstance(flag, bool):
self.strategy.cudnn_exhaustive_search = flag
else:
print(
"WARNING: cudnn_exhaustive_search should have value of bool type"
)
@property
def conv_workspace_size_limit(self):
return self.strategy.conv_workspace_size_limit
@conv_workspace_size_limit.setter
def conv_workspace_size_limit(self, value):
if isinstance(value, int):
self.strategy.conv_workspace_size_limit = value
else:
print(
"WARNING: conv_workspace_size_limit should have value of int type"
)
@property
def cudnn_batchnorm_spatial_persistent(self):
return self.strategy.cudnn_batchnorm_spatial_persistent
@cudnn_batchnorm_spatial_persistent.setter
def cudnn_batchnorm_spatial_persistent(self, flag):
if isinstance(flag, bool):
self.strategy.cudnn_batchnorm_spatial_persistent = flag
else:
print(
"WARNING: cudnn_batchnorm_spatial_persistent should have value of bool type"
)
def _enable_env(self):
strategy = self.strategy
keys = [
"FLAGS_cudnn_batchnorm_spatial_persistent",
"FLAGS_conv_workspace_size_limit",
"FLAGS_cudnn_exhaustive_search",
"FLAGS_sync_nccl_allreduce",
"FLAGS_fuse_parameter_memory_size",
"FLAGS_fuse_parameter_groups_size",
]
values = [
bool(strategy.cudnn_batchnorm_spatial_persistent),
int(strategy.conv_workspace_size_limit),
bool(strategy.cudnn_exhaustive_search),
bool(strategy.sync_nccl_allreduce),
int(strategy.fuse_grad_size_in_MB),
int(strategy.fuse_grad_size_in_TFLOPS),
]
for i, key in enumerate(keys):
if core.globals().is_public(key):
core.globals()[key] = values[i]
def __repr__(self):
fields = self.strategy.DESCRIPTOR.fields
for f in fields:
......
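Usage of the new knobs, mirroring the unit test added below; note that, per the setters above, a value of the wrong type is ignored with a warning rather than raised:

```python
import paddle

strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.cudnn_exhaustive_search = False
strategy.conv_workspace_size_limit = 1024          # int expected
strategy.cudnn_batchnorm_spatial_persistent = True
strategy.conv_workspace_size_limit = "400"         # wrong type: warned, keeps 1024
```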
......@@ -383,6 +383,7 @@ class Fleet(object):
context["valid_strategy"] = valid_strategy
self.valid_strategy = valid_strategy
self.valid_strategy._enable_env()
optimize_ops = []
params_grads = []
......
......@@ -271,6 +271,6 @@ endforeach()
# setting timeout value for old unittests
if(NOT WIN32)
set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 200)
set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 250)
set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200)
endif()
......@@ -132,13 +132,16 @@ def _declarative_(dygraph_func):
"""
Converts imperative dygraph APIs into declarative function APIs. Decorator
@declarative handles the Program and Executor of static mode and returns
the result as a dygraph VarBase.
the result as dygraph Tensor(s). Users can use the returned dygraph
Tensor(s) for imperative training, inference, or other operations. If the
decorated function calls other imperative functions, they will be
converted into declarative functions as well.
Args:
dygraph_func (callable): callable imperative function.
Returns:
VarBase: containing the numerical result.
Tensor(s): containing the numerical result.
Examples:
.. code-block:: python
......@@ -147,6 +150,7 @@ def _declarative_(dygraph_func):
import numpy as np
from paddle.fluid.dygraph.jit import declarative
fluid.enable_dygraph()
@declarative
def func(x):
......
......@@ -110,7 +110,7 @@ def scope_guard(scope):
_switch_scope(ex)
def as_numpy(tensor):
def as_numpy(tensor, copy=False):
"""
Convert a Tensor to a numpy.ndarray. It only supports Tensors without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
......@@ -129,6 +129,7 @@ def as_numpy(tensor):
Args:
tensor(Variable): an instance of Tensor
copy(bool, optional): Whether to use deep copy.
Returns:
numpy.ndarray
......@@ -145,7 +146,10 @@ def as_numpy(tensor):
Please set the parameter 'return_numpy' as 'False' to \
return LoDTensor itself directly.")
if tensor._is_initialized():
return np.array(tensor)
if copy:
return np.array(tensor)
else:
return np.asarray(tensor)
else:
return None
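The copy flag leans on numpy's own semantics: np.array copies by default, while np.asarray reuses the underlying buffer when it can, which is what makes the zero-copy TensorToPyArray path above pay off. A small illustration:

```python
import numpy as np

buf = np.arange(4)
a = np.array(buf)    # deep copy: independent of buf
b = np.asarray(buf)  # no copy: shares memory with buf
buf[0] = 99
print(a[0], b[0])    # 0 99
```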
......@@ -350,7 +354,7 @@ def _fetch_var(name, scope=None, return_numpy=True):
" program.")
tensor = var.get_tensor()
if return_numpy:
tensor = as_numpy(tensor)
tensor = as_numpy(tensor, copy=True)
return tensor
......
......@@ -522,9 +522,15 @@ if(NOT WIN32)
endif()
if(NOT APPLE AND NOT WIN32)
parallel_bash_test_modules(test_acp START_BASH parallel_test.sh TIMEOUT 140 UnitTests test_auto_checkpoint test_auto_checkpoint1 test_auto_checkpoint2 test_auto_checkpoint3)
parallel_bash_test_modules(test_acp2 START_BASH parallel_test.sh TIMEOUT 140 UnitTests test_auto_checkpoint_multiple test_auto_checkpoint_dist_basic)
parallel_bash_test_modules(test_hdfs START_BASH parallel_test.sh TIMEOUT 120 UnitTests test_hdfs1 test_hdfs2 test_hdfs3)
bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140)
endif()
add_subdirectory(sequence)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
from op_test import OpTest, _set_use_system_allocator
from paddle.fluid.framework import grad_var_name
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
import paddle
class TestBatchNorm(unittest.TestCase):
def test_name(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
with fluid.dygraph.guard(p):
batch_norm1d = paddle.nn.BatchNorm1d(1, name="test")
def test_error(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
#paddle.disable_static()
x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32')
def error1d():
x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
batch_norm1d = paddle.nn.BatchNorm1d(1)
batch_norm1d(fluid.dygraph.to_variable(x_data_4))
def error2d():
x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32')
batch_norm2d = paddle.nn.BatchNorm2d(1)
batch_norm2d(fluid.dygraph.to_variable(x_data_3))
def error3d():
x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
batch_norm3d = paddle.nn.BatchNorm3d(1)
batch_norm3d(fluid.dygraph.to_variable(x_data_4))
with fluid.dygraph.guard(p):
self.assertRaises(ValueError, error1d)
self.assertRaises(ValueError, error2d)
self.assertRaises(ValueError, error3d)
def test_dygraph(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
shape = [4, 10, 4, 4]
def compute_v1(x, is_test, trainable_statistics):
with fluid.dygraph.guard(p):
bn = fluid.dygraph.BatchNorm(
shape[1],
is_test=is_test,
trainable_statistics=trainable_statistics)
y = bn(fluid.dygraph.to_variable(x))
return y.numpy()
def compute_v2(x):
with fluid.dygraph.guard(p):
bn = paddle.nn.BatchNorm2d(shape[1])
y = bn(fluid.dygraph.to_variable(x))
return y.numpy()
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x, False, False)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
def test_static(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
exe = fluid.Executor(p)
shape = [4, 10, 16, 16]
def compute_v1(x_np, is_test, trainable_statistics):
with program_guard(Program(), Program()):
bn = fluid.dygraph.BatchNorm(
shape[1],
is_test=is_test,
trainable_statistics=trainable_statistics)
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = bn(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
def compute_v2(x_np):
with program_guard(Program(), Program()):
bn = paddle.nn.BatchNorm2d(shape[1])
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = bn(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x, False, False)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
if __name__ == '__main__':
unittest.main()
......@@ -294,6 +294,28 @@ class TestStrategyConfig(unittest.TestCase):
with self.assertRaises(TypeError):
strategy.unknown_key = 'UNK'
def test_cudnn_exhaustive_search(self):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.cudnn_exhaustive_search = False
self.assertEqual(strategy.cudnn_exhaustive_search, False)
strategy.cudnn_exhaustive_search = "True"
self.assertEqual(strategy.cudnn_exhaustive_search, False)
def test_cudnn_batchnorm_spatial_persistent(self):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.cudnn_batchnorm_spatial_persistent = False
self.assertEqual(strategy.cudnn_batchnorm_spatial_persistent, False)
strategy.cudnn_batchnorm_spatial_persistent = "True"
self.assertEqual(strategy.cudnn_batchnorm_spatial_persistent, False)
def test_conv_workspace_size_limit(self):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.conv_workspace_size_limit = 1000
self.assertEqual(strategy.conv_workspace_size_limit, 1000)
strategy.conv_workspace_size_limit = "400"
self.assertEqual(strategy.conv_workspace_size_limit, 1000)
strategy._enable_env()
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
from op_test import OpTest, _set_use_system_allocator
from paddle.fluid.framework import grad_var_name
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
import paddle
class TestDygraphGroupNormv2(unittest.TestCase):
def test_dygraph(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
shape = [2, 6, 2, 2]
def compute_v1(x):
with fluid.dygraph.guard(p):
gn = fluid.dygraph.GroupNorm(channels=2, groups=2)
y = gn(fluid.dygraph.to_variable(x))
return y.numpy()
def compute_v2(x):
with fluid.dygraph.guard(p):
gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2)
y = gn(fluid.dygraph.to_variable(x))
return y.numpy()
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
def test_static(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
exe = fluid.Executor(p)
shape = [2, 6, 2, 2]
def compute_v1(x_np):
with program_guard(Program(), Program()):
gn = fluid.dygraph.GroupNorm(channels=2, groups=2)
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = gn(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
def compute_v2(x_np):
with program_guard(Program(), Program()):
gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2)
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = gn(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
from op_test import OpTest, _set_use_system_allocator
from paddle.fluid.framework import grad_var_name
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
import paddle
class TestInstanceNorm(unittest.TestCase):
def test_error(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu(
"instance_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
def error1d():
x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
instance_norm1d = paddle.nn.InstanceNorm1d(1)
instance_norm1d(fluid.dygraph.to_variable(x_data_4))
def error2d():
x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32')
instance_norm2d = paddle.nn.InstanceNorm2d(1)
instance_norm2d(fluid.dygraph.to_variable(x_data_3))
def error3d():
x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
instance_norm3d = paddle.nn.InstanceNorm3d(1)
instance_norm3d(fluid.dygraph.to_variable(x_data_4))
with fluid.dygraph.guard(p):
self.assertRaises(ValueError, error1d)
self.assertRaises(ValueError, error2d)
self.assertRaises(ValueError, error3d)
def test_dygraph(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu(
"instance_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
shape = [4, 10, 4, 4]
def compute_v1(x):
with fluid.dygraph.guard(p):
bn = fluid.dygraph.InstanceNorm(shape[1])
y = bn(fluid.dygraph.to_variable(x))
return y.numpy()
def compute_v2(x):
with fluid.dygraph.guard(p):
bn = paddle.nn.InstanceNorm2d(shape[1])
y = bn(fluid.dygraph.to_variable(x))
return y.numpy()
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
def test_static(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu(
"instance_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
exe = fluid.Executor(p)
shape = [4, 10, 16, 16]
def compute_v1(x_np):
with program_guard(Program(), Program()):
ins = fluid.dygraph.InstanceNorm(shape[1])
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = ins(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
def compute_v2(x_np):
with program_guard(Program(), Program()):
ins = paddle.nn.InstanceNorm2d(shape[1])
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = ins(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
from op_test import OpTest, _set_use_system_allocator
from paddle.fluid.framework import grad_var_name
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
import paddle
class TestDygraphLayerNormv2(unittest.TestCase):
def test_dygraph(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
shape = [4, 10, 4, 4]
def compute_v1(x):
with fluid.dygraph.guard(p):
ln = fluid.dygraph.LayerNorm(shape[1:])
y = ln(fluid.dygraph.to_variable(x))
return y.numpy()
def compute_v2(x):
with fluid.dygraph.guard(p):
ln = paddle.nn.LayerNorm(shape[1:])
y = ln(fluid.dygraph.to_variable(x))
return y.numpy()
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
def test_static(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
exe = fluid.Executor(p)
shape = [4, 10, 16, 16]
def compute_v1(x_np):
with program_guard(Program(), Program()):
ln = fluid.dygraph.LayerNorm(shape[1:])
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = ln(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
def compute_v2(x_np):
with program_guard(Program(), Program()):
ln = paddle.nn.LayerNorm(shape[1:])
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = ln(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
if __name__ == '__main__':
unittest.main()
......@@ -44,8 +44,8 @@ class TestMultiplyAPI(unittest.TestCase):
def __run_dynamic_graph_case(self, x_data, y_data, axis=-1):
paddle.disable_static()
x = paddle.to_variable(x_data)
y = paddle.to_variable(y_data)
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
res = paddle.multiply(x, y, axis=axis)
return res.numpy()
......@@ -126,17 +126,31 @@ class TestMultiplyError(unittest.TestCase):
paddle.disable_static()
x_data = np.random.randn(200).astype(np.int8)
y_data = np.random.randn(200).astype(np.int8)
x = paddle.to_variable(x_data)
y = paddle.to_variable(y_data)
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y)
# test dynamic computation graph: inputs must be broadcastable
x_data = np.random.rand(200, 5)
y_data = np.random.rand(200)
x = paddle.to_variable(x_data)
y = paddle.to_variable(y_data)
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y)
# test dynamic computation graph: inputs must be broadcastable(python)
x_data = np.random.rand(200, 5)
y_data = np.random.rand(200)
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y)
# test dynamic computation graph: dtype must be same
x_data = np.random.randn(200).astype(np.int64)
y_data = np.random.randn(200).astype(np.float64)
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
self.assertRaises(TypeError, paddle.multiply, x, y)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle
import paddle.fluid.core as core
def numpy_topk(x, k=1, axis=-1, largest=True):
if axis < 0:
axis = len(x.shape) + axis
if largest:
indices = np.argsort(-x, axis=axis)
else:
indices = np.argsort(x, axis=axis)
if largest:
value = -np.sort(-x, axis=axis)
else:
value = np.sort(x, axis=axis)
indices = indices.take(indices=range(0, k), axis=axis)
value = value.take(indices=range(0, k), axis=axis)
return value, indices
class TestTopkOp(OpTest):
def init_args(self):
self.k = 3
self.axis = 1
self.largest = True
def setUp(self):
self.op_type = "top_k_v2"
self.dtype = np.float64
self.input_data = np.random.rand(10, 20)
self.init_args()
self.inputs = {'X': self.input_data}
self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
output, indices = numpy_topk(
self.input_data, axis=self.axis, k=self.k, largest=self.largest)
self.outputs = {'Out': output, 'Indices': indices}
def test_check_output(self):
paddle.enable_static()
self.check_output()
def test_check_grad(self):
paddle.enable_static()
self.check_grad(set(['X']), 'Out')
class TestTopOp1(TestTopkOp):
def init_args(self):
self.k = 3
self.axis = 0
self.largest = True
class TestTopOp2(TestTopkOp):
def init_args(self):
self.k = 3
self.axis = 0
self.largest = False
class TestTopOp3(TestTopkOp):
def init_args(self):
self.k = 4
self.axis = 0
self.largest = False
class TestTopOp4(TestTopkOp):
def init_args(self):
self.k = 4
self.axis = 0
self.largest = False
class TestTopkOp5(TestTopkOp):
def init_args(self):
self.k = 3
self.axis = 1
self.largest = True
def setUp(self):
self.op_type = "top_k_v2"
self.dtype = np.float64
self.input_data = np.random.rand(10, 10, 5)
self.init_args()
self.inputs = {'X': self.input_data}
self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
output, indices = numpy_topk(
self.input_data, axis=self.axis, k=self.k, largest=self.largest)
self.outputs = {'Out': output, 'Indices': indices}
class TestTopkOp6(TestTopkOp):
def init_args(self):
self.k = 3
self.axis = 1
self.largest = True
def setUp(self):
self.op_type = "top_k_v2"
self.dtype = np.float64
self.input_data = np.random.rand(10, 10, 5)
self.init_args()
self.inputs = {'X': self.input_data}
self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
output, indices = numpy_topk(
self.input_data, axis=self.axis, k=self.k, largest=self.largest)
self.outputs = {'Out': output, 'Indices': indices}
class TestTopKAPI(unittest.TestCase):
def setUp(self):
np.random.seed(123)
self.input_data = np.random.rand(6, 7, 8)
self.large_input_data = np.random.rand(2, 1030)
def run_dygraph(self, place):
paddle.disable_static(place)
input_tensor = paddle.to_tensor(self.input_data)
large_input_tensor = paddle.to_tensor(self.large_input_data)
# test case for basic test case 1
paddle_result = paddle.topk(input_tensor, k=2)
numpy_result = numpy_topk(self.input_data, k=2)
self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
# test case for basic test case 2 with axis
paddle_result = paddle.topk(input_tensor, k=2, axis=1)
numpy_result = numpy_topk(self.input_data, k=2, axis=1)
self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
# test case for basic test case 3 with tensor K
k_tensor = paddle.to_tensor(np.array([2]))
paddle_result = paddle.topk(input_tensor, k=k_tensor, axis=1)
numpy_result = numpy_topk(self.input_data, k=2, axis=1)
self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
# test case for basic test case 4 with tensor largest
k_tensor = paddle.to_tensor(np.array([2]))
paddle_result = paddle.topk(input_tensor, k=2, axis=1, largest=False)
numpy_result = numpy_topk(self.input_data, k=2, axis=1, largest=False)
self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
# test case for basic test case 5 with axis -1
k_tensor = paddle.to_tensor(np.array([2]))
paddle_result = paddle.topk(input_tensor, k=2, axis=-1, largest=False)
numpy_result = numpy_topk(self.input_data, k=2, axis=-1, largest=False)
self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
# test case for basic test case 6 for the partial sort
paddle_result = paddle.topk(large_input_tensor, k=1, axis=-1)
numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1)
self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
# test case for basic test case 7 for the unsorted
paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False)
sort_paddle = numpy_topk(
np.array(paddle_result[0].numpy()), axis=1, k=2)
numpy_result = numpy_topk(self.input_data, k=2, axis=1)
self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0]))
def run_static(self, place):
paddle.enable_static()
with paddle.static.program_guard(paddle.static.Program(),
paddle.static.Program()):
input_tensor = paddle.static.data(
name="x", shape=[6, 7, 8], dtype="float64")
large_input_tensor = paddle.static.data(
name="large_x", shape=[2, 1030], dtype="float64")
k_tensor = paddle.static.data(name="k", shape=[1], dtype="int32")
result1 = paddle.topk(input_tensor, k=2)
result2 = paddle.topk(input_tensor, k=2, axis=-1)
result3 = paddle.topk(input_tensor, k=k_tensor, axis=1)
result4 = paddle.topk(input_tensor, k=2, axis=1, largest=False)
result5 = paddle.topk(input_tensor, k=2, axis=-1, largest=False)
result6 = paddle.topk(large_input_tensor, k=1, axis=-1)
result7 = paddle.topk(input_tensor, k=2, axis=1, sorted=False)
exe = paddle.static.Executor(place)
input_data = np.random.rand(10, 20).astype("float64")
large_input_data = np.random.rand(2, 100).astype("float64")
paddle_result = exe.run(
feed={
"x": self.input_data,
"large_x": self.large_input_data,
"k": np.array([2]).astype("int32")
},
fetch_list=[
result1[0], result1[1], result2[0], result2[1], result3[0],
result3[1], result4[0], result4[1], result5[0], result5[1],
result6[0], result6[1], result7[0], result7[1]
])
numpy_result = numpy_topk(self.input_data, k=2)
self.assertTrue(np.allclose(paddle_result[0], numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[1], numpy_result[1]))
numpy_result = numpy_topk(self.input_data, k=2, axis=-1)
self.assertTrue(np.allclose(paddle_result[2], numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[3], numpy_result[1]))
numpy_result = numpy_topk(self.input_data, k=2, axis=1)
self.assertTrue(np.allclose(paddle_result[4], numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[5], numpy_result[1]))
numpy_result = numpy_topk(
self.input_data, k=2, axis=1, largest=False)
self.assertTrue(np.allclose(paddle_result[6], numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[7], numpy_result[1]))
numpy_result = numpy_topk(
self.input_data, k=2, axis=-1, largest=False)
self.assertTrue(np.allclose(paddle_result[8], numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[9], numpy_result[1]))
numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1)
self.assertTrue(np.allclose(paddle_result[10], numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[11], numpy_result[1]))
sort_paddle = numpy_topk(paddle_result[12], axis=1, k=2)
numpy_result = numpy_topk(self.input_data, k=2, axis=1)
self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0]))
def test_cases(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.run_dygraph(place)
self.run_static(place)
if __name__ == "__main__":
unittest.main()
......@@ -17,6 +17,7 @@ from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
......@@ -125,5 +126,164 @@ class TestRandomGPU(TestUniqueOp):
self.check_output_with_place(place, atol=1e-5)
class TestSortedUniqueOp(TestUniqueOp):
def init_config(self):
self.inputs = {'X': np.array([2, 3, 3, 1, 5, 3], dtype='int64')}
unique, indices, inverse, count = np.unique(
self.inputs['X'],
return_index=True,
return_inverse=True,
return_counts=True,
axis=None)
self.attrs = {
'dtype': int(core.VarDesc.VarType.INT32),
"return_index": True,
"return_inverse": True,
"return_counts": True,
"axis": None,
"is_sorted": True
}
self.outputs = {
'Out': unique,
'Indices': indices,
"Index": inverse,
"Counts": count,
}
class TestUniqueOpAxisNone(TestUniqueOp):
def init_config(self):
self.inputs = {'X': np.random.random((4, 7, 10)).astype('float64')}
unique, indices, inverse, counts = np.unique(
self.inputs['X'],
return_index=True,
return_inverse=True,
return_counts=True,
axis=None)
self.attrs = {
'dtype': int(core.VarDesc.VarType.INT32),
"return_index": True,
"return_inverse": True,
"return_counts": True,
"axis": None,
"is_sorted": True
}
self.outputs = {
'Out': unique,
'Indices': indices,
"Index": inverse,
"Counts": counts,
}
class TestUniqueOpAxis1(TestUniqueOp):
def init_config(self):
self.inputs = {'X': np.random.random((3, 8, 8)).astype('float64')}
unique, indices, inverse, counts = np.unique(
self.inputs['X'],
return_index=True,
return_inverse=True,
return_counts=True,
axis=1)
self.attrs = {
'dtype': int(core.VarDesc.VarType.INT32),
"return_index": True,
"return_inverse": True,
"return_counts": True,
"axis": [1],
"is_sorted": True
}
self.outputs = {
'Out': unique,
'Indices': indices,
"Index": inverse,
"Counts": counts,
}
class TestUniqueAPI(unittest.TestCase):
def test_dygraph_api_out(self):
paddle.disable_static()
        x_data = np.random.randint(0, 10, (120, ))
x = paddle.to_tensor(x_data)
out = paddle.unique(x)
expected_out = np.unique(x_data)
        self.assertTrue((out.numpy() == expected_out).all())
paddle.enable_static()
def test_dygraph_api_attr(self):
paddle.disable_static()
x_data = np.random.random((3, 5, 5)).astype("float32")
x = paddle.to_tensor(x_data)
out, index, inverse, counts = paddle.unique(
x,
return_index=True,
return_inverse=True,
return_counts=True,
axis=0)
np_out, np_index, np_inverse, np_counts = np.unique(
x_data,
return_index=True,
return_inverse=True,
return_counts=True,
axis=0)
        self.assertTrue((out.numpy() == np_out).all())
        self.assertTrue((index.numpy() == np_index).all())
        self.assertTrue((inverse.numpy() == np_inverse).all())
        self.assertTrue((counts.numpy() == np_counts).all())
paddle.enable_static()
def test_static_graph(self):
with paddle.static.program_guard(paddle.static.Program(),
paddle.static.Program()):
x = paddle.data(name='x', shape=[3, 2], dtype='float64')
unique, inverse, counts = paddle.unique(
x, return_inverse=True, return_counts=True, axis=0)
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
x_np = np.array([[1, 2], [3, 4], [1, 2]]).astype('float64')
result = exe.run(feed={"x": x_np},
fetch_list=[unique, inverse, counts])
np_unique, np_inverse, np_counts = np.unique(
x_np, return_inverse=True, return_counts=True, axis=0)
self.assertTrue(np.allclose(result[0], np_unique))
self.assertTrue(np.allclose(result[1], np_inverse))
self.assertTrue(np.allclose(result[2], np_counts))
class TestUniqueError(unittest.TestCase):
def test_input_dtype(self):
def test_x_dtype():
with paddle.static.program_guard(paddle.static.Program(),
paddle.static.Program()):
x = paddle.data(name='x', shape=[10, 10], dtype='float16')
result = paddle.unique(x)
self.assertRaises(TypeError, test_x_dtype)
def test_attr(self):
x = paddle.data(name='x', shape=[10, 10], dtype='float64')
def test_return_index():
result = paddle.unique(x, return_index=0)
self.assertRaises(TypeError, test_return_index)
def test_return_inverse():
result = paddle.unique(x, return_inverse='s')
self.assertRaises(TypeError, test_return_inverse)
def test_return_counts():
result = paddle.unique(x, return_counts=3)
self.assertRaises(TypeError, test_return_counts)
def test_axis():
result = paddle.unique(x, axis='12')
self.assertRaises(TypeError, test_axis)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -26,17 +26,22 @@ from collections import Iterable
import paddle
from paddle import fluid
from paddle.fluid import core
from paddle.fluid.framework import in_dygraph_mode, Variable, ParamBase, _current_expected_place
# Note: Use alias `Input` temporarily before releasing hapi feature.
from paddle.static import InputSpec as Input
from paddle.fluid.executor import global_scope
from paddle.fluid.io import is_belong_to_optimizer
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, FunctionSpec
from paddle.fluid.layers.utils import flatten
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
from paddle.fluid.incubate.fleet.base import role_maker
from paddle.fluid.executor import scope_guard, Executor
from paddle.io import DataLoader, Dataset
from paddle.fluid.dygraph.layers import Layer
from paddle.metric import Metric
from .distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized
......@@ -846,24 +851,32 @@ class Model(object):
"""
return self._adapter.test_batch(inputs)
    def save(self, path, training=True):
        """
        This function saves parameters, optimizer information or a model and
        its parameters for inference only to the given path, depending on the
        argument `training`.
        If `training` is set to True, the saved parameters contain all the
        trainable Variables and are written to a file with the suffix
        ".pdparams".
        The optimizer information contains all the variables used by the
        optimizer. For the Adam optimizer, this includes beta1, beta2,
        momentum, etc. All the information is written to a file with the
        suffix ".pdopt". (If the optimizer has no variables that need to be
        saved, such as SGD, the file is not generated.)
        This function will silently overwrite existing files
        at the target location.
        If `training` is set to False, only the inference model is saved. Note
        that before using `save`, you should run the model once, and the saved
        input shape is the same as the input used in that run. In dynamic
        mode, `@paddle.jit.to_static` must currently be added to the `forward`
        function of your layer; this will be optimized later.
        Args:
            path (str): The file prefix to save the model. The format is
                'dirname/file_prefix' or 'file_prefix'. If it is an empty
                string, an exception will be raised.
            training (bool, optional): Whether to save for training. If not,
                save for inference only. Default: True.
Returns:
None
......@@ -871,25 +884,47 @@ class Model(object):
Examples:
.. code-block:: python
                import paddle
                import paddle.incubate.hapi as hapi
                from paddle.nn import Linear
                from paddle.incubate.hapi.datasets.mnist import MNIST as MnistDataset
                class Mnist(paddle.nn.Layer):
                    def __init__(self):
                        super(Mnist, self).__init__()
                        self._fc = Linear(784, 1, act='softmax')
                    @paddle.jit.to_static  # if saving for inference in dygraph, this is required
                    def forward(self, x):
                        y = self._fc(x)
                        return y
dynamic = True # False
device = hapi.set_device('cpu')
# if use static graph, do not set
paddle.disable_static(device) if dynamic else None
# inputs and labels are not required for dynamic graph.
input = hapi.Input([None, 784], 'float32', 'x')
label = hapi.Input([None, 1], 'int64', 'label')
model = hapi.Model(Mnist(), input, label)
optim = paddle.optimizer.SGD(learning_rate=1e-3,
parameter_list=model.parameters())
model.prepare(optim,
paddle.nn.CrossEntropyLoss(),
hapi.metrics.Accuracy())
mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False)
model.fit(mnist_data, epochs=1, batch_size=32, verbose=0)
model.save('checkpoint/test') # save for training
model.save('inference_model', False) # save for inference
"""
if ParallelEnv().local_rank == 0:
if not training:
self._save_inference_model(path)
else:
self._adapter.save(path)
def load(self, path, skip_mismatch=False, reset_optimizer=False):
"""
......@@ -1474,13 +1509,17 @@ class Model(object):
cbks.on_end('test', logs)
return outputs
def _save_inference_model(self,
save_dir,
model_filename=None,
params_filename=None,
model_only=False):
"""
        Save an inference model, in either static or dynamic mode.
        Note that before using `save_inference_model`, you should run the
        model once, and the saved input shape is the same as the input used in
        that run. In dynamic mode, `@paddle.jit.to_static` must currently be
        added to the `forward` function of your layer; this will be optimized
        later.
Args:
save_dir (str): The directory path to save the inference model.
......@@ -1496,39 +1535,145 @@ class Model(object):
Returns:
list: The fetch variables' name list
Examples:
.. code-block:: python
import numpy as np
import paddle
from paddle.static import InputSpec
import paddle.incubate.hapi as hapi
                from paddle.nn import Linear
                from paddle.incubate.hapi.datasets.mnist import MNIST as MnistDataset
                class Mnist(paddle.nn.Layer):
def __init__(self, classifier_act=None):
super(Mnist, self).__init__()
self.fc = Linear(input_dim=784, output_dim=10, act="softmax")
@paddle.jit.to_static # In static mode, you need to delete this.
def forward(self, inputs):
outputs = self.fc(inputs)
return outputs
dynamic = True # False
device = hapi.set_device('gpu')
# if use static graph, do not set
paddle.disable_static(device) if dynamic else None
# inputs and labels are not required for dynamic graph.
input = InputSpec([None, 784], 'float32', 'x')
label = InputSpec([None, 1], 'int64', 'label')
model = hapi.Model(Mnist(), input, label)
optim = paddle.optimizer.SGD(learning_rate=1e-3,
parameter_list=model.parameters())
model.prepare(optim,
paddle.nn.CrossEntropyLoss(),
hapi.metrics.Accuracy())
mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False)
model.fit(mnist_data, epochs=1, batch_size=32, verbose=0)
model.save_inference_model('inference_model')
"""
        def get_inout_spec(all_vars, return_name=False):
            valid_vars = [var for var in all_vars if isinstance(var, Variable)]
            result_list = valid_vars
            if return_name:
                result_list = [var.name for var in result_list]
            return result_list
        # TODO:
        # 1. Make it unnecessary for users to run the model before calling
        #    `save_inference_model` in dygraph mode.
        # 2. Save the correct input shape; currently the interface stores the
        #    shape of whatever the user fed to the model's inputs when running it.
        # 3. Make it unnecessary for users to add `@paddle.jit.to_static` in dynamic mode.
if fluid.in_dygraph_mode():
layer = self.network
fluid.disable_dygraph()
# 1. input check
prog_translator = ProgramTranslator()
if not prog_translator.enable_declarative:
raise RuntimeError(
"save_inference_model doesn't work when setting ProgramTranslator.enable=False."
)
if not isinstance(layer, Layer):
raise TypeError(
"The input layer should be 'Layer', but received layer type is %s."
% type(layer))
# 2. get program of declarative Layer.forward
prog_cache = prog_translator.get_program_cache()
# make dummy args & kwargs, to get excepted FunctionSpec
layer_func = FunctionSpec(type(layer).forward, [layer], {})
concrete_program, _ = prog_cache.get_program(layer_func)
# NOTE: we maintain the mapping of variable name to
# structured name, the buffer variable (non-persistable)
# saved to inference program may not need by dygraph Layer,
# we only record the state_dict variable's structured name
state_names_dict = dict()
for structured_name, var in layer.state_dict().items():
state_names_dict[var.name] = structured_name
# 3. share parameters from Layer to scope & record var info
scope = core.Scope()
extra_var_info = dict()
for param_or_buffer in concrete_program.parameters:
# share to scope
param_or_buffer_tensor = scope.var(
param_or_buffer.name).get_tensor()
src_tensor = param_or_buffer.value().get_tensor()
param_or_buffer_tensor._share_data_with(src_tensor)
# record var info
extra_info_dict = dict()
if param_or_buffer.name in state_names_dict:
extra_info_dict['structured_name'] = state_names_dict[
param_or_buffer.name]
extra_info_dict['stop_gradient'] = param_or_buffer.stop_gradient
if isinstance(param_or_buffer, ParamBase):
extra_info_dict['trainable'] = param_or_buffer.trainable
extra_var_info[param_or_buffer.name] = extra_info_dict
# 4. build input & output spec
input_var_names = get_inout_spec(concrete_program.inputs, True)
output_vars = get_inout_spec(concrete_program.outputs)
# 5. save inference model
with scope_guard(scope):
return fluid.io.save_inference_model(
dirname=save_dir,
feeded_var_names=input_var_names,
target_vars=output_vars,
executor=Executor(_current_expected_place()),
main_program=concrete_program.main_program.clone(),
model_filename=model_filename,
params_filename=params_filename,
program_only=model_only)
else:
prog = self._adapter._progs.get('test', None)
assert prog, \
"Model is not ready, please call `model.prepare()` first"
infer_prog = prog.clone(for_test=True)
input_names = [v.name for v in self._adapter._input_vars['test']]
endpoints = self._adapter._endpoints['test']['output']
return fluid.io.save_inference_model(
save_dir,
input_names,
endpoints,
self._adapter._executor,
main_program=infer_prog,
model_filename=model_filename,
params_filename=params_filename,
program_only=model_only)
def _run_one_epoch(self, data_loader, callbacks, mode, logs={}):
outputs = []
......
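# A minimal sketch (not part of this commit) of consuming the artifacts that
# `_save_inference_model` / `save(path, training=False)` write: load them back
# with fluid.io.load_inference_model and run one batch. The directory name and
# input shape below are illustrative assumptions.
import numpy as np
import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())
[program, feed_names, fetch_targets] = fluid.io.load_inference_model(
    dirname='inference_model', executor=exe)
img = np.random.random((1, 1, 28, 28)).astype('float32')
results = exe.run(program,
                  feed={feed_names[0]: img},
                  fetch_list=fetch_targets)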
......@@ -12,6 +12,7 @@ endforeach()
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
set_tests_properties(test_dataset_imdb PROPERTIES TIMEOUT 150)
function(py_dist_test TARGET_NAME)
......
......@@ -33,6 +33,8 @@ from paddle.metric import Accuracy
from paddle.incubate.hapi.datasets import MNIST
from paddle.incubate.hapi.vision.models import LeNet
from paddle.incubate.hapi.distributed import DistributedBatchSampler, prepare_distributed_context
from paddle.fluid.dygraph.jit import declarative
from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator
class LeNetDygraph(fluid.dygraph.Layer):
......@@ -65,6 +67,37 @@ class LeNetDygraph(fluid.dygraph.Layer):
return x
class LeNetDeclarative(fluid.dygraph.Layer):
def __init__(self, num_classes=10, classifier_activation=None):
super(LeNetDeclarative, self).__init__()
self.num_classes = num_classes
self.features = Sequential(
Conv2d(
1, 6, 3, stride=1, padding=1),
ReLU(),
Pool2D(2, 'max', 2),
Conv2d(
6, 16, 5, stride=1, padding=0),
ReLU(),
Pool2D(2, 'max', 2))
if num_classes > 0:
self.fc = Sequential(
Linear(400, 120),
Linear(120, 84),
Linear(
84, 10, act=classifier_activation))
@declarative
def forward(self, inputs):
x = self.features(inputs)
if self.num_classes > 0:
x = fluid.layers.flatten(x, 1)
x = self.fc(x)
return x
class MnistDataset(MNIST):
def __init__(self, mode, return_label=True, sample_num=None):
super(MnistDataset, self).__init__(mode=mode)
......@@ -335,7 +368,6 @@ class TestModelFunction(unittest.TestCase):
model = Model(net, inputs, labels)
model.prepare(optim2, loss=CrossEntropyLoss(reduction="sum"))
loss, = model.train_batch([data], [label])
np.testing.assert_allclose(loss.flatten(), ref.flatten())
fluid.disable_dygraph() if dynamic else None
......@@ -445,33 +477,38 @@ class TestModelFunction(unittest.TestCase):
fluid.disable_dygraph() if dynamic else None
def test_export_deploy_model(self):
for dynamic in [True, False]:
fluid.enable_dygraph() if dynamic else None
# paddle.disable_static() if dynamic else None
prog_translator = ProgramTranslator()
prog_translator.enable(False) if not dynamic else None
net = LeNetDeclarative()
inputs = [Input([None, 1, 28, 28], 'float32', 'x')]
model = Model(net, inputs)
model.prepare()
save_dir = tempfile.mkdtemp()
if not os.path.exists(save_dir):
os.makedirs(save_dir)
tensor_img = np.array(
np.random.random((1, 1, 28, 28)), dtype=np.float32)
ori_results = model.test_batch(tensor_img)
model.save(save_dir, training=False)
fluid.disable_dygraph() if dynamic else None
place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
new_scope = fluid.Scope()
with fluid.scope_guard(new_scope):
exe = fluid.Executor(place)
[inference_program, feed_target_names, fetch_targets] = (
fluid.io.load_inference_model(
dirname=save_dir, executor=exe))
results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)
np.testing.assert_allclose(
results, ori_results, rtol=1e-5, atol=1e-7)
shutil.rmtree(save_dir)
class TestRaiseError(unittest.TestCase):
......
......@@ -128,6 +128,12 @@ from .layer.norm import GroupNorm #DEFINE_ALIAS
from .layer.norm import LayerNorm #DEFINE_ALIAS
from .layer.norm import SpectralNorm #DEFINE_ALIAS
from .layer.norm import InstanceNorm #DEFINE_ALIAS
from .layer.norm import InstanceNorm1d #DEFINE_ALIAS
from .layer.norm import InstanceNorm2d #DEFINE_ALIAS
from .layer.norm import InstanceNorm3d #DEFINE_ALIAS
from .layer.norm import BatchNorm1d #DEFINE_ALIAS
from .layer.norm import BatchNorm2d #DEFINE_ALIAS
from .layer.norm import BatchNorm3d #DEFINE_ALIAS
# from .layer.rnn import RNNCell #DEFINE_ALIAS
# from .layer.rnn import GRUCell #DEFINE_ALIAS
# from .layer.rnn import LSTMCell #DEFINE_ALIAS
......
......@@ -160,12 +160,12 @@ from .loss import square_error_cost #DEFINE_ALIAS
from .loss import ssd_loss #DEFINE_ALIAS
from .loss import teacher_student_sigmoid_loss #DEFINE_ALIAS
from .loss import ctc_loss #DEFINE_ALIAS
# from .norm import batch_norm #DEFINE_ALIAS
# from .norm import data_norm #DEFINE_ALIAS
# from .norm import group_norm #DEFINE_ALIAS
# from .norm import instance_norm #DEFINE_ALIAS
from .norm import l2_normalize #DEFINE_ALIAS
# from .norm import layer_norm #DEFINE_ALIAS
from .norm import batch_norm #DEFINE_ALIAS
from .norm import instance_norm #DEFINE_ALIAS
from .norm import layer_norm #DEFINE_ALIAS
from .norm import lrn #DEFINE_ALIAS
from .norm import normalize #DEFINE_ALIAS
# from .norm import spectral_norm #DEFINE_ALIAS
......
......@@ -18,16 +18,19 @@ import paddle.fluid as fluid
from ...fluid.data_feeder import check_variable_and_dtype, check_type
from ...fluid.layer_helper import LayerHelper
from ...fluid.framework import in_dygraph_mode, core
from ...framework import create_parameter
from ...fluid.layers import l2_normalize #DEFINE_ALIAS
from ...fluid.layers import lrn #DEFINE_ALIAS
from ...fluid.initializer import Constant
from ...fluid.param_attr import ParamAttr
from ...fluid import core, dygraph_utils
__all__ = [
# 'batch_norm',
'batch_norm',
# 'data_norm',
# 'group_norm',
# 'instance_norm',
'instance_norm',
'l2_normalize',
# 'layer_norm',
'layer_norm',
'lrn',
'normalize',
# 'spectral_norm'
......@@ -110,3 +113,286 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
eps = out.block.create_var(dtype=out.dtype)
paddle.fill_constant([1], out.dtype, epsilon, out=eps)
return paddle.elementwise_div(x, paddle.maximum(out, eps), name=name)
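# A minimal sketch (not part of this commit) cross-checking the semantics of
# `normalize` above against NumPy: out = x / max(||x||_p, epsilon) along `axis`.
import numpy as np
import paddle

paddle.disable_static()
x_np = np.random.random((3, 4)).astype('float32')
norm = np.maximum(np.linalg.norm(x_np, ord=2, axis=1, keepdims=True), 1e-12)
out = paddle.nn.functional.normalize(paddle.to_tensor(x_np), p=2, axis=1)
assert np.allclose(out.numpy(), x_np / norm, atol=1e-6)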
def batch_norm(x,
running_mean,
running_var,
weight,
bias,
training=False,
momentum=0.9,
epsilon=1e-05,
data_format="NCHW",
name=None):
"""
    Applies Batch Normalization as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift.
    nn.functional.batch_norm is used by nn.BatchNorm1d, nn.BatchNorm2d and nn.BatchNorm3d. Please use those APIs for BatchNorm.
    Parameters:
        x(Tensor): input value. Its data type should be float32 or float64.
        running_mean(Tensor): running mean.
        running_var(Tensor): running variance.
        weight(Tensor): The weight tensor of batch_norm; it can not be None.
        bias(Tensor): The bias tensor of batch_norm; it can not be None.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        training(bool, optional): True means train mode, which computes statistics from the batch data and tracks the global mean and variance during training. False means inference mode, which uses the global mean and variance computed during training. Default: False.
        data_format(str, optional): Specify the input data format; may be "NC", "NCL", "NCHW" or "NCDHW". Default: "NCHW".
        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
Returns:
        A Tensor holding the normalized result.
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
          np.random.seed(123)
x = np.random.random(size=(2, 1, 2, 3)).astype('float32')
running_mean = np.random.random(size=1).astype('float32')
running_variance = np.random.random(size=1).astype('float32')
weight_data = np.random.random(size=1).astype('float32')
bias_data = np.random.random(size=1).astype('float32')
x = paddle.to_tensor(x)
rm = paddle.to_tensor(running_mean)
rv = paddle.to_tensor(running_variance)
w = paddle.to_tensor(weight_data)
b = paddle.to_tensor(bias_data)
batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv, w, b)
          print(batch_norm_out.numpy())
"""
assert len(x.shape) >= 2, "input dim must be larger than 1"
    # `not training` means use_global_stats; more details in nn._BatchNormBase
    use_global_stats = not training
    # input and output must share the memory
mean_out = running_mean
variance_out = running_var
if in_dygraph_mode():
# for dygraph need tuple
attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout",
data_format, "use_mkldnn", False, "fuse_with_relu", False,
"use_global_stats", use_global_stats)
batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
x, weight, bias, running_mean, running_var, mean_out, variance_out,
*attrs)
return dygraph_utils._append_activation_in_dygraph(
batch_norm_out, act=None)
check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
'BatchNorm')
# for static need dict
attrs = {
"momentum": momentum,
"epsilon": epsilon,
"data_layout": data_format,
"use_mkldnn": False,
"fuse_with_relu": False,
"use_global_stats": use_global_stats,
}
inputs = {
"X": [x],
"Scale": [weight],
"Bias": [bias],
"Mean": [running_mean],
"Variance": [running_var]
}
helper = LayerHelper('batch_norm', **locals())
    dtype = x.dtype if x.dtype != 'float16' else 'float32'
saved_mean = helper.create_variable_for_type_inference(
dtype=dtype, stop_gradient=True)
saved_variance = helper.create_variable_for_type_inference(
dtype=dtype, stop_gradient=True)
batch_norm_out = helper.create_variable_for_type_inference(dtype)
outputs = {
"Y": [batch_norm_out],
"MeanOut": [running_mean],
"VarianceOut": [running_var],
"SavedMean": [saved_mean],
"SavedVariance": [saved_variance]
}
helper.append_op(
type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
return helper.append_activation(batch_norm_out)
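# A minimal sketch (not part of this commit) verifying the inference-mode math
# of the functional batch_norm above against NumPy:
#   y = (x - running_mean) / sqrt(running_var + epsilon) * weight + bias,
# broadcast over the channel axis of an NCHW input. All names are illustrative.
import numpy as np
import paddle

paddle.disable_static()
np.random.seed(0)
x_np = np.random.random((2, 3, 4, 4)).astype('float32')
rm, rv, w, b = (np.random.random(3).astype('float32') for _ in range(4))
bc = lambda a: a.reshape(1, 3, 1, 1)  # broadcast a per-channel vector
expected = (x_np - bc(rm)) / np.sqrt(bc(rv) + 1e-5) * bc(w) + bc(b)
out = paddle.nn.functional.batch_norm(
    paddle.to_tensor(x_np), paddle.to_tensor(rm), paddle.to_tensor(rv),
    paddle.to_tensor(w), paddle.to_tensor(b), training=False)
assert np.allclose(out.numpy(), expected, atol=1e-4)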
def layer_norm(x,
normalized_shape,
weight=None,
bias=None,
epsilon=1e-05,
name=None):
"""
    See more details in paddle.nn.LayerNorm.
Parameters:
x(Tensor): Input Tensor. It's data type should be float32, float64.
normalized_shape(int|list|tuple): Input shape from an expected input of
size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`.
If it is a single integer, this module will normalize over the last dimension
which is expected to be of that specific size.
epsilon(float, optional): The small value added to the variance to prevent
division by zero. Default: 1e-05.
        weight(Tensor, optional): The weight tensor of layer_norm. Default: None.
        bias(Tensor, optional): The bias tensor of layer_norm. Default: None.
        name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
Returns:
        A Tensor holding the normalized result.
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
np.random.seed(123)
x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
          layer_norm_out = paddle.nn.functional.layer_norm(x, x.shape[1:])
          print(layer_norm_out.numpy())
"""
input_shape = list(x.shape)
input_ndim = len(input_shape)
normalized_ndim = len(normalized_shape)
begin_norm_axis = input_ndim - normalized_ndim
if input_ndim < normalized_ndim or input_shape[
begin_norm_axis:] != normalized_shape:
str_normalized_shape = str(normalized_shape)
raise ValueError('Given normalized_shape is ' + str_normalized_shape +
', expected input with shape [*, ' +
str_normalized_shape[
1:] + ', but got input shape ' + str(input_shape))
if in_dygraph_mode():
pre_act, _, _ = core.ops.layer_norm(x, weight, bias, 'epsilon', epsilon,
'begin_norm_axis', begin_norm_axis)
return dygraph_utils._append_activation_in_dygraph(pre_act, act=None)
check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'LayerNorm')
inputs = dict()
inputs['X'] = [x]
if weight:
inputs['Scale'] = [weight]
if bias:
inputs['Bias'] = [bias]
attrs = {"epsilon": epsilon, "begin_norm_axis": begin_norm_axis}
# create output
helper = LayerHelper('layer_norm', **locals())
mean_out = helper.create_variable_for_type_inference(
dtype=x.type, stop_gradient=True)
variance_out = helper.create_variable_for_type_inference(
dtype=x.type, stop_gradient=True)
layer_norm_out = helper.create_variable_for_type_inference(x.type)
helper.append_op(
type="layer_norm",
inputs=inputs,
outputs={
"Y": layer_norm_out,
"Mean": mean_out,
"Variance": variance_out,
},
attrs={"epsilon": epsilon,
"begin_norm_axis": begin_norm_axis})
return helper.append_activation(layer_norm_out)
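# A minimal sketch (not part of this commit) checking the layer_norm above
# against a NumPy reference: normalize over the trailing `normalized_shape`
# dims per sample (no affine transform here since weight/bias stay None).
import numpy as np
import paddle

paddle.disable_static()
x_np = np.random.random((2, 3, 4)).astype('float32')
mean = x_np.reshape(2, -1).mean(axis=1).reshape(2, 1, 1)
var = x_np.reshape(2, -1).var(axis=1).reshape(2, 1, 1)
expected = (x_np - mean) / np.sqrt(var + 1e-5)
out = paddle.nn.functional.layer_norm(
    paddle.to_tensor(x_np), list(x_np.shape[1:]))
assert np.allclose(out.numpy(), expected, atol=1e-4)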
def instance_norm(x,
running_mean=None,
running_var=None,
weight=None,
bias=None,
use_input_stats=True,
momentum=0.9,
eps=1e-05,
data_format="NCHW",
name=None):
"""
    See more details in nn.layer.InstanceNorm2d.
Parameters:
x(Tensor): Input Tensor. It's data type should be float32, float64.
running_mean(Tensor): running mean. Default None.
running_var(Tensor): running variance. Default None.
weight(Tensor, optional): The weight tensor of instance_norm. Default: None.
bias(Tensor, optional): The bias tensor of instance_norm. Default: None.
eps(float, optional): A value added to the denominator for numerical stability. Default is 1e-5.
momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        use_input_stats(bool, optional): Default True.
        data_format(str, optional): Specify the input data format; may be "NC", "NCL", "NCHW" or "NCDHW". Default: "NCHW".
        name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
Returns:
        A Tensor holding the normalized result.
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
np.random.seed(123)
x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
          instance_norm_out = paddle.nn.functional.instance_norm(x)
          print(instance_norm_out.numpy())
"""
if in_dygraph_mode():
out, _, _ = core.ops.instance_norm(x, weight, bias, "epsilon", eps,
"momentum", momentum, "data_format",
data_format)
return out
check_variable_and_dtype(x, 'input', ['float32', 'float64'], "InstanceNorm")
attrs = {"epsilon": eps, "momentum": momentum, "data_format": data_format}
if weight and bias:
inputs = {"X": [x], "Scale": [weight], "Bias": [bias]}
else:
inputs = {"X": [x]}
helper = LayerHelper('instance_norm', **locals())
saved_mean = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True)
saved_variance = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True)
instance_norm_out = helper.create_variable_for_type_inference(x.dtype)
outputs = {
"Y": [instance_norm_out],
"SavedMean": [saved_mean],
"SavedVariance": [saved_variance]
}
helper.append_op(
type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs)
return instance_norm_out
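# A minimal sketch (not part of this commit) checking instance_norm above
# against NumPy: every (sample, channel) slice is normalized with its own
# mean and variance over the spatial dims.
import numpy as np
import paddle

paddle.disable_static()
x_np = np.random.random((2, 3, 4, 5)).astype('float32')
mean = x_np.mean(axis=(2, 3), keepdims=True)
var = x_np.var(axis=(2, 3), keepdims=True)
expected = (x_np - mean) / np.sqrt(var + 1e-5)
out = paddle.nn.functional.instance_norm(paddle.to_tensor(x_np))
assert np.allclose(out.numpy(), expected, atol=1e-4)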
......@@ -41,6 +41,7 @@ from ...fluid import core
from ...fluid.framework import in_dygraph_mode
from ...fluid.param_attr import ParamAttr
from ...fluid.initializer import Constant
from paddle.framework import get_default_dtype
from .. import functional as F
......@@ -423,7 +424,7 @@ class PReLU(layers.Layer):
For more information, please refer to :ref:`api_guide_Name`.
Shape:
- input: Tensor with any shape.
- input: Tensor with any shape. Default dtype is float32.
- output: Tensor with the same shape as input.
Examples:
......@@ -433,13 +434,14 @@ class PReLU(layers.Layer):
import numpy as np
paddle.disable_static()
paddle.set_default_dtype("float64")
data = np.array([[[[-2.0, 3.0, -4.0, 5.0],
[ 3.0, -4.0, 5.0, -6.0],
[-7.0, -8.0, 8.0, 9.0]],
[[ 1.0, -2.0, -3.0, 4.0],
[-5.0, 6.0, 7.0, -8.0],
[ 6.0, 7.0, 8.0, 9.0]]]], 'float32')
[ 6.0, 7.0, 8.0, 9.0]]]], 'float64')
x = paddle.to_tensor(data)
m = paddle.nn.PReLU(1, 0.25)
out = m(x)
......@@ -461,10 +463,10 @@ class PReLU(layers.Layer):
self._weight = self.create_parameter(
attr=self._weight_attr,
shape=[num_parameters],
dtype='float32',
shape=[self._num_parameters],
dtype=get_default_dtype(),
is_bias=False,
default_initializer=Constant(init))
default_initializer=Constant(self._init))
def forward(self, x):
return F.prelu(x, self._weight)
......
(This diff is collapsed.)
......@@ -27,7 +27,6 @@ from ..fluid.layers import expand_as #DEFINE_ALIAS
from ..fluid.layers import slice #DEFINE_ALIAS
from ..fluid.layers import strided_slice #DEFINE_ALIAS
from ..fluid.layers import transpose #DEFINE_ALIAS
from ..fluid.layers import unique #DEFINE_ALIAS
from ..fluid.layers import unstack #DEFINE_ALIAS
from ..fluid.layers import scatter_nd_add #DEFINE_ALIAS
......@@ -608,6 +607,126 @@ def squeeze(x, axis=None, name=None):
return layers.squeeze(x, axis, name)
def unique(x,
return_index=False,
return_inverse=False,
return_counts=False,
axis=None,
name=None):
"""
Returns the unique elements of `x` in ascending order.
Args:
x(Tensor): The input tensor, it's data type should be float32, float64, int32, int64.
return_index(bool, optional): If True, also return the indices of the input tensor that
result in the unique Tensor.
return_inverse(bool, optional): If True, also return the indices for where elements in
the original input ended up in the returned unique tensor.
return_counts(bool, optional): If True, also return the counts for each unique element.
axis(int, optional): The axis to apply unique. If None, the input will be flattened.
Default: None.
name(str, optional): Name for the operation. For more information, please refer to
:ref:`api_guide_Name`. Default: None.
Returns:
tuple: (out, indices, inverse, counts). `out` is the unique tensor for `x`. `indices` is \
provided only if `return_index` is True. `inverse` is provided only if `return_inverse` \
is True. `counts` is provided only if `return_counts` is True.
Examples:
.. code-block:: python
import numpy as np
import paddle
paddle.disable_static()
x_data = np.array([2, 3, 3, 1, 5, 3])
x = paddle.to_tensor(x_data)
unique = paddle.unique(x)
np_unique = unique.numpy() # [1 2 3 5]
_, indices, inverse, counts = paddle.unique(x, return_index=True, return_inverse=True, return_counts=True)
np_indices = indices.numpy() # [3 0 1 4]
np_inverse = inverse.numpy() # [1 2 2 0 3 2]
np_counts = counts.numpy() # [1 1 3 1]
            x_data = np.array([[2, 1, 3], [3, 0, 1], [2, 1, 3]])
            x = paddle.to_tensor(x_data)
            unique = paddle.unique(x)
np_unique = unique.numpy() # [0 1 2 3]
unique = paddle.unique(x, axis=0)
np_unique = unique.numpy()
# [[2 1 3]
# [3 0 1]]
"""
if axis is None:
axis = []
else:
axis = [axis]
if in_dygraph_mode():
out, inverse, indices, counts = core.ops.unique(
x, 'dtype',
convert_np_dtype_to_dtype_('int32'), 'return_index', return_index,
'return_inverse', return_inverse, 'return_counts', return_counts,
'axis', axis, "is_sorted", True)
outs = [out]
if return_index:
outs.append(indices)
if return_inverse:
outs.append(inverse)
if return_counts:
outs.append(counts)
if len(outs) == 1:
return outs[0]
return tuple(outs)
check_variable_and_dtype(x, "input",
['float32', 'float64', 'int32', 'int64'], 'unique')
check_type(return_index, 'return_index', bool, 'unique')
check_type(return_inverse, 'return_inverse', bool, 'unique')
check_type(return_counts, 'return_counts', bool, 'unique')
if len(axis) != 0:
check_type(axis[0], 'axis', int, 'unique')
helper = LayerHelper('unique', **locals())
attrs = {
'dtype': int(core.VarDesc.VarType.INT32),
"return_index": return_index,
"return_inverse": return_inverse,
"return_counts": return_counts,
"axis": axis,
"is_sorted": True
}
out = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True)
inverse = helper.create_variable_for_type_inference(
dtype=core.VarDesc.VarType.INT64, stop_gradient=True)
outputs = {"Out": out, "Index": inverse}
outs = [out]
if return_index:
indices = helper.create_variable_for_type_inference(
dtype=core.VarDesc.VarType.INT64, stop_gradient=True)
outputs["Indices"] = indices
outs.append(indices)
if return_inverse:
outs.append(inverse)
if return_counts:
counts = helper.create_variable_for_type_inference(
dtype=core.VarDesc.VarType.INT64, stop_gradient=True)
outputs["Counts"] = counts
outs.append(counts)
helper.append_op(
type="unique", inputs={"X": x}, attrs=attrs, outputs=outputs)
if len(outs) == 1:
return outs[0]
return tuple(outs)
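# A minimal sketch (not part of this commit) of the invariants relating the
# outputs of `unique` above: `out[inverse]` reconstructs `x`, and `counts`
# sums to the number of elements.
import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.array([2, 3, 3, 1, 5, 3]))
out, index, inverse, counts = paddle.unique(
    x, return_index=True, return_inverse=True, return_counts=True)
assert (out.numpy()[inverse.numpy()] == x.numpy()).all()
assert counts.numpy().sum() == x.numpy().size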
def unsqueeze(x, axis, name=None):
"""
:alias_main: paddle.unsqueeze
......
......@@ -562,34 +562,52 @@ floor_mod = remainder #DEFINE_ALIAS
def multiply(x, y, axis=-1, name=None):
"""
:alias_main: paddle.multiply
:alias: paddle.multiply,paddle.tensor.multiply,paddle.tensor.math.multiply
multiply two tensors element-wise. The equation is:
Examples:
.. math::
out = x * y
.. code-block:: python
**Note**:
``paddle.multiply`` supports broadcasting. If you would like to know more about broadcasting, please refer to :ref:`user_guide_broadcasting` .
import paddle
import numpy as np
Args:
x (Tensor): the input tensor, its data type should be float32, float64, int32, int64.
y (Tensor): the input tensor, its data type should be float32, float64, int32, int64.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
paddle.disable_static()
x_data = np.array([[1, 2], [3, 4]], dtype=np.float32)
y_data = np.array([[5, 6], [7, 8]], dtype=np.float32)
x = paddle.to_variable(x_data)
y = paddle.to_variable(y_data)
res = paddle.multiply(x, y)
print(res.numpy()) # [[5, 12], [21, 32]]
Returns:
N-D Tensor. A location into which the result is stored. Its dimension equals with $x$.
x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32)
y_data = np.array([1, 2], dtype=np.float32)
x = paddle.to_variable(x_data)
y = paddle.to_variable(y_data)
res = paddle.multiply(x, y, axis=1)
print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]]
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
x_data = np.array([[1, 2], [3, 4]], dtype=np.float32)
y_data = np.array([[5, 6], [7, 8]], dtype=np.float32)
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
res = paddle.multiply(x, y)
print(res.numpy()) # [[5, 12], [21, 32]]
x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32)
y_data = np.array([1, 2], dtype=np.float32)
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
res = paddle.multiply(x, y, axis=1)
print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]]
"""
op_type = 'elementwise_mul'
act = None
if x.dtype != y.dtype:
raise TypeError(
'Input tensors must be same type, but received type of x: %s, type of y: %s '
% (x.dtype, y.dtype))
if in_dygraph_mode():
return _elementwise_op_in_dygraph(
x, y, axis=axis, act=act, op_name=op_type)
......
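# A minimal sketch (not part of this commit) of the dtype check added to
# `multiply` above: mixing float32 and float64 operands now raises TypeError.
import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.ones((2, 2), dtype=np.float32))
y = paddle.to_tensor(np.ones((2, 2), dtype=np.float64))
try:
    paddle.multiply(x, y)
except TypeError as e:
    print('dtype mismatch rejected:', e)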
......@@ -21,7 +21,6 @@ from ..fluid import core, layers
from ..fluid.layers import argmin #DEFINE_ALIAS
from ..fluid.layers import has_inf #DEFINE_ALIAS
from ..fluid.layers import has_nan #DEFINE_ALIAS
from ..fluid.layers import topk #DEFINE_ALIAS
__all__ = [
'argmax',
......@@ -756,3 +755,100 @@ def masked_select(x, mask, name=None):
type='masked_select', inputs={'X': x,
'Mask': mask}, outputs={'Y': out})
return out
def topk(x, k, axis=None, largest=True, sorted=True, name=None):
"""
    This OP is used to find the values and indices of the k largest or smallest elements along the given axis.
If the input is a 1-D Tensor, finds the k largest or smallest values and indices.
If the input is a Tensor with higher rank, this operator computes the top k values and indices along the :attr:`axis`.
Args:
x(Tensor): Tensor, an input N-D Tensor with type float32, float64, int32, int64.
k(int, Tensor): The number of top elements to look for along the axis.
axis(int, optional): Axis to compute indices along. The effective range
is [-R, R), where R is x.ndim. when axis < 0, it works the same way
as axis + R. Default is -1.
        largest(bool, optional): a flag; if set to True, the algorithm will sort
            in descending order, otherwise in ascending order. Default is True.
        sorted(bool, optional): controls whether to return the elements in sorted order; default value is True. On GPU, it always returns sorted values.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
tuple(Tensor), return the values and indices. The value data type is the same as the input `x`. The indices data type is int64.
Examples:
.. code-block:: python
import numpy as np
import paddle
paddle.disable_static()
data_1 = np.array([1, 4, 5, 7])
tensor_1 = paddle.to_tensor(data_1)
value_1, indices_1 = paddle.topk(tensor_1, k=1)
print(value_1.numpy())
# [7]
print(indices_1.numpy())
# [3]
data_2 = np.array([[1, 4, 5, 7], [2, 6, 2, 5]])
tensor_2 = paddle.to_tensor(data_2)
value_2, indices_2 = paddle.topk(tensor_2, k=1)
print(value_2.numpy())
# [[7]
# [6]]
print(indices_2.numpy())
# [[3]
# [1]]
value_3, indices_3 = paddle.topk(tensor_2, k=1, axis=-1)
print(value_3.numpy())
# [[7]
# [6]]
print(indices_3.numpy())
# [[3]
# [1]]
value_4, indices_4 = paddle.topk(tensor_2, k=1, axis=0)
print(value_4.numpy())
# [[2 6 5 7]]
print(indices_4.numpy())
# [[1 1 0 0]]
"""
if in_dygraph_mode():
k = k.numpy().item(0) if isinstance(k, Variable) else k
if axis is None:
out, indices = core.ops.top_k_v2(x, 'k',
int(k), 'largest', largest,
'sorted', sorted)
else:
out, indices = core.ops.top_k_v2(x, 'k',
int(k), 'axis', axis, 'largest',
largest, 'sorted', sorted)
return out, indices
helper = LayerHelper("top_k_v2", **locals())
inputs = {"X": [x]}
attrs = {}
if isinstance(k, Variable):
inputs['K'] = [k]
else:
attrs = {'k': k}
attrs['largest'] = largest
attrs['sorted'] = sorted
if axis is not None:
attrs['axis'] = axis
values = helper.create_variable_for_type_inference(dtype=x.dtype)
indices = helper.create_variable_for_type_inference(dtype="int64")
helper.append_op(
type="top_k_v2",
inputs=inputs,
outputs={"Out": [values],
"Indices": [indices]},
attrs=attrs)
indices.stop_gradient = True
return values, indices
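# A minimal sketch (not part of this commit) cross-checking `topk` above with
# a NumPy reference built from argsort. With distinct random values the
# returned values must match (indices may differ only on ties).
import numpy as np
import paddle

paddle.disable_static()
x_np = np.random.random((4, 6)).astype('float32')
order = np.argsort(-x_np, axis=1)[:, :2]             # top-2 indices per row
ref_vals = np.take_along_axis(x_np, order, axis=1)
vals, idx = paddle.topk(paddle.to_tensor(x_np), k=2, axis=1)
assert np.allclose(vals.numpy(), ref_vals)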
......@@ -63,12 +63,12 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /o
go get github.com/Masterminds/glide && \
rm -rf /root/requirements.txt
RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32
RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
......
......@@ -156,19 +156,19 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3 --no-cache-dir install opencv-python && \
pip3 --no-cache-dir install opencv-python==4.2.0.32 && \
pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.6 --no-cache-dir install opencv-python && \
pip3.6 --no-cache-dir install opencv-python==4.2.0.32 && \
pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.7 --no-cache-dir install opencv-python && \
pip3.7 --no-cache-dir install opencv-python==4.2.0.32 && \
pip3.8 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.8 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.8 --no-cache-dir install opencv-python && \
pip3.8 --no-cache-dir install opencv-python==4.2.0.32 && \
pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip --no-cache-dir install opencv-python
pip --no-cache-dir install opencv-python==4.2.0.32
#For docstring checker
RUN pip3 --no-cache-dir install pylint pytest astroid isort && \
......
......@@ -89,7 +89,7 @@ function do_cpython_build {
fi
# NOTE Make libpython shared library visible to python calls below
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel==0.32.2
cd /
ls ${MY_DIR}
local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py)
......
......@@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
ARG WITH_GPU
ARG WITH_AVX
ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-ON}
ENV WITH_AVX=${WITH_AVX:-ON}
......@@ -199,12 +198,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
RUN pip --no-cache-dir install certifi urllib3[secure]
# Install woboq_codebrowser to /woboq
RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
(cd /woboq \
cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-DCMAKE_BUILD_TYPE=Release . \
make)
# ar mishandles 4GB files
# https://sourceware.org/bugzilla/show_bug.cgi?id=14625
......
......@@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
ARG WITH_GPU
ARG WITH_AVX
ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-ON}
ENV WITH_AVX=${WITH_AVX:-ON}
......@@ -212,12 +211,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
RUN pip --no-cache-dir install certifi urllib3[secure]
# Install woboq_codebrowser to /woboq
RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
(cd /woboq \
cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-DCMAKE_BUILD_TYPE=Release . \
make)
# ar mishandles 4GB files
# https://sourceware.org/bugzilla/show_bug.cgi?id=14625
......
......@@ -480,14 +480,8 @@ def get_filenames():
filename = ''
print("\nWARNING:----Exception in get api filename----\n")
print("\n" + api + ' module is ' + module + "\n")
if filename != '':
# rm contrib file
if filename.startswith(
'../python/paddle/fluid/contrib'
) or filename == '../python/paddle/verison.py':
pass
elif filename not in filenames:
filenames.append(filename)
if filename != '' and filename not in filenames:
filenames.append(filename)
# get all methods
method = ''
if inspect.isclass(eval(api)):
......@@ -557,14 +551,18 @@ def get_wlist():
'''
wlist = []
wlist_file = []
with open("wlist.json", 'r') as load_f:
load_dict = json.load(load_f)
for key in load_dict:
            if key == 'wlist_file':
                wlist_file = wlist_file + load_dict[key]
            else:
                wlist = wlist + load_dict[key]
    return wlist, wlist_file
wlist, wlist_file = get_wlist()
if len(sys.argv) < 2:
print("Error: inadequate number of arguments")
......@@ -590,8 +588,14 @@ else:
if len(filenames) == 0 and len(whl_error) == 0:
print("-----API_PR.spec is the same as API_DEV.spec-----")
exit(0)
elif '../python/paddle/fluid/core_avx.py' in filenames:
filenames.remove('../python/paddle/fluid/core_avx.py')
rm_file = []
    for f in list(filenames):  # iterate over a copy, since filenames is mutated below
for w_file in wlist_file:
if f.startswith(w_file):
rm_file.append(f)
filenames.remove(f)
if len(rm_file) != 0:
print("REMOVE white files: %s" % rm_file)
print("API_PR is diff from API_DEV: %s" % filenames)
one_part_filenum = int(math.ceil(len(filenames) / cpus))
if one_part_filenum == 0:
......
{
"wlist_file" : [
"../python/paddle/fluid/contrib",
"../python/paddle/verison.py",
"../python/paddle/fluid/core_avx.py",
"../python/paddle/distributed"
],
"wlist_inneed":[
"append_LARS",
"BuildStrategy.debug_graphviz_path",
......