diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h
index 055c0bc57c51d7c091601a4f98eb58677148a18f..bdf402397dd38f66484abc8497185c8b8a762035 100644
--- a/paddle/fluid/operators/svd_helper.h
+++ b/paddle/fluid/operators/svd_helper.h
@@ -20,6 +20,9 @@
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/diag_op.h"
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/functors.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -89,7 +92,6 @@ struct PowFunctor {
 };
 
 static std::vector<int> GetBroadcastShape(InTensors ins) {
-  // TODO(xiongkun03) check the operators and output
   PADDLE_ENFORCE_EQ(ins.size(), 2, platform::errors::InvalidArgument(
                                        "GetBroadcastShape Receive 2 tensors"
                                        "but got [%d]", ins.size()));
@@ -125,6 +127,19 @@ static std::vector<int> GetBroadcastShape(InTensors ins) {
   return broadcast_shape;
 }
 
+#define DITO_TRANSPOSE_RANK_CASE(N)             \
+  case N: {                                     \
+    math::Transpose<DeviceContext, T, N> trans; \
+    trans(dev_ctx, x, &ret, axis);              \
+    break;                                      \
+  }
+
+#define DITO_SLICE_RANK_CASE(N)                      \
+  case N: {                                          \
+    EigenSliceWrapper<N>(&x, offset, extends, &ret); \
+    break;                                           \
+  }
+
 template <typename DeviceContext, typename T>
 struct DeviceIndependenceTensorOperations {
   // 1. Device indenpendence, for kernel reuse.
@@ -153,20 +168,25 @@ struct DeviceIndependenceTensorOperations {
   framework::Tensor Matmul(const framework::Tensor& mat_a,
                            const framework::Tensor& mat_b, bool trans_a = false,
                            bool trans_b = false) {
-    framework::AttributeMap attrs;
-    attrs["trans_x"] = trans_a;
-    attrs["trans_y"] = trans_b;
-    NameInTensorMap inputs({{"X", {&mat_a}}, {"Y", {&mat_b}}});
+    framework::Tensor ret;
     auto a_dim = mat_a.dims();
     auto b_dim = mat_b.dims();
     std::vector<int> x_vec = framework::vectorize<int>(a_dim);
     x_vec[x_vec.size() - 2] = a_dim[a_dim.size() - (trans_a ? 1 : 2)];
     x_vec[x_vec.size() - 1] = b_dim[b_dim.size() - (trans_b ? 2 : 1)];
-    return CreateOpRunAndReturnTensor("matmul_v2", inputs, attrs, x_vec);
+    ret.Resize(framework::make_ddim(x_vec));
+    ret.mutable_data<T>(context.GetPlace());
+    auto blas = GetBlas();
+    auto mat_a_discrib = math::CreateMatrixDescriptor(a_dim, 0, trans_a);
+    auto mat_b_discrib = math::CreateMatrixDescriptor(b_dim, 0, trans_b);
+    blas.MatMul(mat_a, mat_a_discrib, mat_b, mat_b_discrib, T(1.0), &ret,
+                T(0.0));
+    return ret;
   }
-  // transpose the last two dimision
+
   framework::Tensor Transpose(const framework::Tensor& x) {
-    framework::Tensor out;
+    // transpose the last two dimensions
+    framework::Tensor ret;
     auto x_dim = x.dims();
     auto x_vec = framework::vectorize<int>(x_dim);
     int rank = x_vec.size();
@@ -177,26 +197,42 @@ struct DeviceIndependenceTensorOperations {
       axis[i] = i;
     }
     std::swap(axis[rank - 1], axis[rank - 2]);
-    framework::AttributeMap attrs;
-    attrs["axis"] = axis;
-    NameInTensorMap inputs({{"X", {&x}}});
-    return CreateOpRunAndReturnTensor("transpose2", inputs, attrs, out_shape,
-                                      {"Out", "XShape"});
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    ret.Resize(framework::make_ddim(x_vec));
+    ret.mutable_data<T>(context.GetPlace());
+    switch (rank) {
+      DITO_TRANSPOSE_RANK_CASE(2);
+      DITO_TRANSPOSE_RANK_CASE(3);
+      DITO_TRANSPOSE_RANK_CASE(4);
+      DITO_TRANSPOSE_RANK_CASE(5);
+      DITO_TRANSPOSE_RANK_CASE(6);
+      default: {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Invalid Rank number, "
+            "currently only support rank between 2~6"));
+      }
+    }
+    return ret;
   }
-
   framework::Tensor Diag(const framework::Tensor& x, int offset = 0,
+                         // FIXME  link error
                          int padding_value = 0) {
-    framework::AttributeMap attrs;
-    attrs["offset"] = offset;
-    attrs["padding_value"] = padding_value;
-    NameInTensorMap inputs({{"X", {&x}}});
+    PADDLE_ENFORCE_EQ(padding_value, 0,
+                      platform::errors::InvalidArgument(
+                          "Current diag only support padding_value = 0"));
+    PADDLE_ENFORCE_EQ(offset, 0,
+                      platform::errors::InvalidArgument(
+                          "Current diag only support offset = 0,"
+                          "you can use DiagOp instead(not recommend)"));
+
+    framework::Tensor ret;
     int x_rank = x.dims().size();
     std::vector<int> out_shape;
     if (x_rank == 2) {
-      PADDLE_ENFORCE_EQ(x.dims()[0], x.dims()[1],
-                        platform::errors::InvalidArgument(
-                            "if X is a Matrix, then X must be square"));
-      out_shape.push_back(x.dims()[0]);
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Current diag only support vector"
+          "-> diagonalized matrix, not support matrix -> vector,"
+          " Use DiagOp instead."));
     } else if (x_rank == 1) {
       out_shape.push_back(x.dims()[0]);
       out_shape.push_back(x.dims()[0]);
@@ -204,42 +240,73 @@ struct DeviceIndependenceTensorOperations {
       PADDLE_THROW(
           platform::errors::InvalidArgument("Rank must less or equal than 2"));
     }
-    return CreateOpRunAndReturnTensor("diag_v2", inputs, attrs, out_shape);
+    ret = Fill({out_shape[0], out_shape[0]}, 0.0);
+    T* output = ret.mutable_data<T>(context.GetPlace());
+    auto for_range = GetForRange(x.numel());
+    for_range(DiagFunctor<T>(x.data<T>(), x.numel(), output));
+    return ret;
+  }
+
+  framework::Tensor Div(const framework::Tensor& x,
+                        const framework::Tensor& y) {
+    framework::Tensor ret;
+    std::vector<int> out_shape = GetBroadcastShape({&x, &y});
+    ret.Resize(framework::make_ddim(out_shape));
+    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(
+        context, &x, &y, -1, DivFunctor<T>(), &ret);
+    return ret;
   }
-
   framework::Tensor Add(const framework::Tensor& x,
                         const framework::Tensor& y) {
-    InTensors ins({&x, &y});
-    framework::AttributeMap attrs;
-    attrs["axis"] = -1;
+    // element wise add, support numpy broadcast.
+    framework::Tensor ret;
     std::vector<int> out_shape = GetBroadcastShape({&x, &y});
-    NameInTensorMap inputs({{"X", {&x}}, {"Y", {&y}}});
-    return CreateOpRunAndReturnTensor("elementwise_add", inputs, attrs,
-                                      out_shape);
+    ret.Resize(framework::make_ddim(out_shape));
+    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
+        context, &x, &y, -1, AddFunctor<T>(), &ret);
+    return ret;
   }
-
   framework::Tensor Mul(const framework::Tensor& x,
                         const framework::Tensor& y) {
-    InTensors ins({&x, &y});
-    framework::AttributeMap attrs;
-    attrs["axis"] = -1;
+    framework::Tensor ret;
     std::vector<int> out_shape = GetBroadcastShape({&x, &y});
-    NameInTensorMap inputs({{"X", {&x}}, {"Y", {&y}}});
-    return CreateOpRunAndReturnTensor("elementwise_mul", inputs, attrs,
-                                      out_shape);
+    ret.Resize(framework::make_ddim(out_shape));
+    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+        context, &x, &y, -1, MulFunctor<T>(), &ret);
+    return ret;
+  }
+
+  framework::Tensor ReduceSum(const framework::Tensor& x,
+                              std::vector<int> out_dim) {
+    framework::AttributeMap attrs;
+    attrs["dim"] = std::vector<int>{-1};
+    NameInTensorMap inputs({{"X", {&x}}});
+    return CreateOpRunAndReturnTensor("reduce_sum", inputs, attrs, out_dim);
+  }
+
+  framework::Tensor ReduceMax(const framework::Tensor& x,
+                              std::vector<int> out_dim) {
+    framework::AttributeMap attrs;
+    attrs["dim"] = std::vector<int>{-1};
+    NameInTensorMap inputs({{"X", {&x}}});
+    return CreateOpRunAndReturnTensor("reduce_max", inputs, attrs, out_dim);
   }
 
   framework::Tensor Sub(const framework::Tensor& x,
                         const framework::Tensor& y) {
-    InTensors ins({&x, &y});
-    framework::AttributeMap attrs;
-    attrs["axis"] = -1;
+    framework::Tensor ret;
     std::vector<int> out_shape = GetBroadcastShape({&x, &y});
-    NameInTensorMap inputs({{"X", {&x}}, {"Y", {&y}}});
-    return CreateOpRunAndReturnTensor("elementwise_sub", inputs, attrs,
-                                      out_shape);
+    ret.Resize(framework::make_ddim(out_shape));
+    if (x.dims().size() >= y.dims().size()) {
+      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+          context, &x, &y, -1, SubFunctor<T>(), &ret);
+    } else {
+      ElementwiseComputeEx<InverseSubFunctor<T>, DeviceContext, T>(
+          // This is copied from elementwise_sub; the operands must be
+          // reversed when x_rank < y_rank.
+          context, &x, &y, -1, InverseSubFunctor<T>(), &ret);
+    }
+    return ret;
   }
-
   const framework::Tensor Unsqueeze(const framework::Tensor& x, int axis = 0) {
     // don't copy data, only change the dims
     framework::Tensor out;
@@ -255,40 +322,29 @@ struct DeviceIndependenceTensorOperations {
     out.Resize(framework::make_ddim(out_shape));
     return out;
   }
-
-  framework::Tensor Zeros(std::vector<int> shape,
-                          framework::proto::VarType::Type dtype,
-                          float fill_value) {
-    framework::AttributeMap attrs;
-    attrs["dtype"] = dtype;
-    attrs["shape"] = shape;
-    attrs["value"] = fill_value;
-    NameInTensorMap inputs({});
-    return CreateOpRunAndReturnTensor("fill_constant", inputs, attrs, shape);
+  framework::Tensor Fill(std::vector<int> shape, float fill_value) {
+    framework::Tensor ret;
+    ret.Resize(framework::make_ddim(shape));
+    ret.mutable_data<T>(context.GetPlace());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    SetConstant<DeviceContext, T>()(dev_ctx, &ret, T(fill_value));
+    return ret;
   }
-
-  framework::Tensor Infinits(std::vector<int> shape,
-                             framework::proto::VarType::Type dtype) {
-    framework::AttributeMap attrs;
-    attrs["dtype"] = dtype;
-    attrs["shape"] = shape;
-    attrs["str_value"] = std::string("inf");
-    NameInTensorMap inputs({});
-    return CreateOpRunAndReturnTensor("fill_constant", inputs, attrs, shape);
+  framework::Tensor Infinits(std::vector<int> shape) {
+    auto value = static_cast<T>(std::numeric_limits<double>::infinity());
+    return Fill(shape, value);
   }
-
-  framework::Tensor Eye(int n, framework::proto::VarType::Type dtype) {
-    auto output = Zeros({n}, dtype, 1);
+  framework::Tensor Eye(int n) {
+    auto output = Fill({n}, 1);
     auto ret = Diag(output);
     return ret;
   }
-
   framework::Tensor Slice(const framework::Tensor& x, std::vector<int> axes,
                           std::vector<int> starts, std::vector<int> ends) {
+    framework::Tensor ret;
     std::vector<int> new_axes = axes;
-    NameInTensorMap inputs({{"Input", {&x}}});
     std::vector<int> out_shape = framework::vectorize<int>(x.dims());
-    int rank = out_shape.size();
+    size_t rank = out_shape.size();
     PADDLE_ENFORCE_EQ(
         axes.size(), starts.size(),
         platform::errors::InvalidArgument("Slice Operator Argument Invalided"));
@@ -306,27 +362,31 @@ struct DeviceIndependenceTensorOperations {
                             "C++ Slice Operation Not Support End < Start"));
       out_shape[axis] = ed - st;
     }
-    framework::AttributeMap attrs;
-    attrs["axes"] = new_axes;
-    attrs["starts"] = starts;
-    attrs["ends"] = ends;
-    return CreateOpRunAndReturnTensor("slice", inputs, attrs, out_shape);
-  }
-
-  framework::Tensor ReduceSum(const framework::Tensor& x,
-                              std::vector<int> out_dim) {
-    framework::AttributeMap attrs;
-    attrs["dim"] = std::vector<int>{-1};
-    NameInTensorMap inputs({{"X", {&x}}});
-    return CreateOpRunAndReturnTensor("reduce_sum", inputs, attrs, out_dim);
-  }
-
-  framework::Tensor ReduceMax(const framework::Tensor& x,
-                              std::vector<int> out_dim) {
-    framework::AttributeMap attrs;
-    attrs["dim"] = std::vector<int>{-1};
-    NameInTensorMap inputs({{"X", {&x}}});
-    return CreateOpRunAndReturnTensor("reduce_max", inputs, attrs, out_dim);
+    std::vector<int> offset(rank), extends(rank);
+    for (size_t i = 0; i < rank; ++i) {
+      offset[i] = 0;
+      extends[i] = x.dims()[i];
+    }
+    for (size_t i = 0; i < new_axes.size(); ++i) {
+      offset[new_axes[i]] = starts[i];
+      extends[new_axes[i]] = ends[i] - starts[i];
+    }
+    ret.Resize(framework::make_ddim(out_shape));
+    ret.mutable_data<T>(context.GetPlace());
+    switch (rank) {
+      DITO_SLICE_RANK_CASE(1);
+      DITO_SLICE_RANK_CASE(2);
+      DITO_SLICE_RANK_CASE(3);
+      DITO_SLICE_RANK_CASE(4);
+      DITO_SLICE_RANK_CASE(5);
+      DITO_SLICE_RANK_CASE(6);
+      default: {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Invalid Rank number, "
+            "currently only support rank between 2~6"));
+      }
+    }
+    return ret;
   }
 
  private:
@@ -338,14 +398,40 @@ struct DeviceIndependenceTensorOperations {
     auto& dev_ctx = context.template device_context<DeviceContext>();
     return platform::ForRange<DeviceContext>(dev_ctx, numel);
   }
-
+  template <size_t D>
+  void EigenSliceWrapper(const framework::Tensor* in,
+                         const std::vector<int>& start,
+                         const std::vector<int>& end, framework::Tensor* out) {
+    // Slice by calling the Eigen Tensor function `.slice()`.
+    size_t rank = in->dims().size();
+    PADDLE_ENFORCE_EQ(start.size(), rank,
+                      platform::errors::InvalidArgument(
+                          "EigenSliceWrapper function start "
+                          "argument must have the same length as input rank."));
+    PADDLE_ENFORCE_EQ(end.size(), rank,
+                      platform::errors::InvalidArgument(
+                          "EigenSliceWrapper function end "
+                          "argument must have the same length as input rank."));
+    auto eigen_place_ptr =
+        context.template device_context<DeviceContext>().eigen_device();
+    auto eigen_place = *eigen_place_ptr;
+    auto out_t = framework::EigenTensor<T, D>::From(*out, out->dims());
+    auto in_t = framework::EigenTensor<T, D>::From(*in, in->dims());
+    Eigen::DSizes<int, D> offsets_32bit, extents_32bit;
+    for (size_t i = 0; i < D; i++) {
+      offsets_32bit[i] = start[i];
+      extents_32bit[i] = end[i];
+    }
+    EigenSlice<std::decay_t<decltype(eigen_place)>, T, D>::Eval(
+        eigen_place, framework::To32BitIndex(out_t),
+        framework::To32BitIndex(in_t), offsets_32bit, extents_32bit);
+  }
   framework::Tensor CreateOpRunAndReturnTensor(
       const std::string& type, const NameInTensorMap& inputs,
       const framework::AttributeMap& attrs, std::vector<int> out_shape,
       NameOutTensor out_str = {"Out"}) {
     // varialble set dims must be LoDTensor / SelectedRowTensor
     framework::Scope& local_scope = context.scope().NewScope();
-
     framework::VariableNameMap op_outputs;
     for (auto out_name : out_str) {
       local_scope.Var("tmp_" + out_name)->GetMutable<framework::LoDTensor>();
@@ -373,6 +459,7 @@ struct DeviceIndependenceTensorOperations {
       }
       op_inputs[item.first] = name_vector;
     }
+
     auto op =
         framework::OpRegistry::CreateOp(type, op_inputs, op_outputs, attrs);
     op->Run(local_scope, context.GetPlace());
diff --git a/paddle/fluid/operators/svd_op.h b/paddle/fluid/operators/svd_op.h
index 1910effbeaa54d6b718fc39f4957a56d83902d77..f387dca7b7f9b2c4e741d8f495a58b05a46c6c6f 100644
--- a/paddle/fluid/operators/svd_op.h
+++ b/paddle/fluid/operators/svd_op.h
@@ -54,7 +54,6 @@ class SvdCPUKernel : public framework::OpKernel<T> {
         size_t(batches * col_v * cols * sizeof(math::Real<T>)));
     auto* S_out = S->mutable_data<math::Real<T>>(
         context.GetPlace(), size_t(batches * k * sizeof(math::Real<T>)));
-    /*SVD Use the Eigen Library*/
     math::BatchSvd<T>(x_data, U_out, VH_out, S_out, rows, cols, batches, full);
   }
 
@@ -96,7 +95,7 @@ class SvdGradKernel : public framework::OpKernel<T> {
     auto s_square = dito.Pow(S, 2);
     auto F = dito.Sub(dito.Unsqueeze(s_square, -2),
                       dito.Unsqueeze(s_square, -1));
-    F = dito.Add(F, dito.Diag(dito.Infinits({k}, U.type())));
+    F = dito.Add(F, dito.Diag(dito.Infinits({k})));
    F = dito.Pow(F, -1);
     Tensor sigma_term;
     Tensor u_term;
@@ -115,8 +114,7 @@ class SvdGradKernel : public framework::OpKernel<T> {
       u_term = dito.Mul(dito.Mul(dito.Sub(UTG, GTU), F), dito.Unsqueeze(S, -2));
       u_term = dito.Matmul(U, u_term);
       if (m > k) {
-        auto project =
-            dito.Sub(dito.Eye(m, U.type()), dito.Matmul(U, U, false, true));
+        auto project = dito.Sub(dito.Eye(m), dito.Matmul(U, U, false, true));
         u_term = dito.Add(u_term, dito.Mul(dito.Matmul(project, dU),
                                            dito.Unsqueeze(s_inverse, -2)));
       }
@@ -129,8 +127,7 @@ class SvdGradKernel : public framework::OpKernel<T> {
       v_term = dito.Mul(dito.Matmul(dito.Mul(dito.Sub(UTG, GTU), F), VH),
                         dito.Unsqueeze(S, -1));
       if (n > k) {
-        auto project =
-            dito.Sub(dito.Eye(n, U.type()), dito.Matmul(VH, VH, true, false));
+        auto project = dito.Sub(dito.Eye(n), dito.Matmul(VH, VH, true, false));
         v_term = dito.Add(v_term, dito.Mul(dito.Matmul(dVH, project),
                                            dito.Unsqueeze(s_inverse, -1)));
       }
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index d0b705cde6aa1cb4937ea207d0ab90340a665799..0ee9f4eed82df44d3bc02e2414fb782e78ae1b6f 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -100,7 +100,6 @@ from .tensor.linalg import bmm  # noqa: F401
 from .tensor.linalg import histogram  # noqa: F401
 from .tensor.linalg import mv  # noqa: F401
 from .tensor.linalg import matrix_power  # noqa: F401
-from .tensor.linalg import svd  # noqa: F401
 from .tensor.logic import equal  # noqa: F401
 from .tensor.logic import greater_equal  # noqa: F401
 from .tensor.logic import greater_than  # noqa: F401
@@ -498,7 +497,6 @@ __all__ = [  # noqa
     'sqrt',
     'cholesky',
     'matrix_power',
-    'svd',
     'randperm',
     'linspace',
     'reshape',
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 24448905b955c61b4e237518c7e55654a59095a8..4a70cd3c7e10b094040b2e62b91e5996e0d8e317 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -889,7 +889,7 @@ set_tests_properties(test_multiprocess_dataloader_iterable_dataset_static PROPER
 set_tests_properties(test_lstm_cudnn_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_stack_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_bilinear_interp_v2_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_svd_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_svd_op PROPERTIES TIMEOUT 40)
 set_tests_properties(test_deformable_psroi_pooling PROPERTIES TIMEOUT 120)
 set_tests_properties(test_trilinear_interp_v2_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_static_runner_mnist PROPERTIES TIMEOUT 120)
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 73369a6e8ea14464eb46d26b664c73f25ef48e02..19624cf6b8fda908784768638dabc9edfd7598e4 100755
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -45,7 +45,6 @@ from .linalg import bmm  # noqa: F401
 from .linalg import histogram  # noqa: F401
 from .linalg import mv  # noqa: F401
 from .linalg import matrix_power  # noqa: F401
-from .linalg import svd  # noqa: F401
 from .logic import equal  # noqa: F401
 from .logic import greater_equal  # noqa: F401
 from .logic import greater_than  # noqa: F401
@@ -226,7 +225,6 @@ tensor_method_func = [  #noqa
     'histogram',
     'mv',
     'matrix_power',
-    'svd',
     'abs',
     'acos',
     'all',
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 39c67b072b60ad18206a930705b75aab6c196cb9..b50643471eed7d36e491bee38b89e6122f70df87 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -1036,46 +1036,51 @@ def mv(x, vec, name=None):
 
 def svd(x, full_matrices=False, name=None):
     r"""
-    Computes the singular value decomposition of one
-    matrix or batches of regular matrice.
+    Computes the singular value decomposition of one matrix or a batch of regular matrices.
+
+    Let :math:`X` be the input matrix or a batch of input matrices, the output should satisfy:
+
+    .. math::
+        X = U * diag(S) * VT
+
     Args:
         x (Tensor): The input tensor. Its shape should be `[..., N, M]`,
-            where ... is zero or more batch dimensions. N and M can be arbitraty
+            where `...` is zero or more batch dimensions. N and M can be arbitrary
             positive number. Note that if x is sigular matrices, the grad is numerical
-            instability. The data type of x should be float32 or float64.
-
-        full_matrices(bool): A flag to control the behavor of svd.
+            unstable. The data type of x should be float32 or float64.
+        full_matrices (bool): A flag to control the behavior of svd.
             If full_matrices = True, svd op will compute full U and V matrics,
-            which means shape of U is `[..., N, N]`, shape of V is `[..., M, M]`.
+            which means shape of U is `[..., N, N]`, shape of V is `[..., M, M]`. K = min(M, N).
            If full_matrices = False, svd op will use a economic method to store U and V.
-            which means shape of U is `[..., N, K]`, shape of V is `[..., M, K]`
+            which means shape of U is `[..., N, K]`, shape of V is `[..., M, K]`. K = min(M, N).
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Tensor: Tensor U, the shape of U is controlled by full_matrices flag.
-        Tensor: Tensor S, the singular value of X. the shape of S is [..., K]
-        Tensor: Tensor VH, the conjugate transpose of V. the shape of V is controlled by full_matrices flag.
+        Tuple of 3 tensors: (U, S, VH). VH is the conjugate transpose of V. S is the singular value vector of the matrices, with shape `[..., K]`.
 
-            import numpy as np
+    Examples:
+        .. code-block:: python
+
+            import paddle
 
             x = paddle.to_tensor([[1.0, 2.0], [1.0, 3.0], [4.0, 6.0]]).astype('float64')
             x = x.reshape([3, 2])
-            u, s, vt = paddle.linalg.svd(x)
+            u, s, vh = paddle.linalg.svd(x)
             print (u)
-            print (s)
-            print (vt)
-
             #U = [[ 0.27364809, -0.21695147 ],
             #     [ 0.37892198, -0.87112408 ],
             #     [ 0.8840446 ,  0.44053933 ]]
 
+            print (s)
             #S = [8.14753743, 0.78589688]
-
+            print (vh)
             #VT= [[ 0.51411221,  0.85772294],
             #     [ 0.85772294, -0.51411221]]
 
-            # one can verify : U * S * VT = X ;
-            #                  U * UH = I ;
-            #                  V * VH = I
+            # one can verify : U * S * VT == X
+            #                  U * UH == I
+            #                  V * VH == I
     """
     if in_dygraph_mode():
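
For reference (not part of the patch): a minimal usage sketch of the API after this change. Since `svd` is no longer re-exported from the top-level `paddle` namespace, callers go through `paddle.linalg.svd`, as the updated docstring shows. The input values are copied from that docstring; the reconstruction check and variable names below are illustrative only.

    import paddle

    # 3 x 2 input taken from the docstring example above.
    x = paddle.to_tensor([[1.0, 2.0], [1.0, 3.0], [4.0, 6.0]], dtype='float64')
    u, s, vh = paddle.linalg.svd(x, full_matrices=False)
    # With full_matrices=False the shapes are U: [3, 2], S: [2], VH: [2, 2], where K = min(M, N) = 2.
    x_rec = u @ paddle.diag(s) @ vh   # rebuild X = U * diag(S) * VH
    print(paddle.allclose(x, x_rec))  # expected: True, up to floating point error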