diff --git a/paddle/fluid/operators/dist_op.cc b/paddle/fluid/operators/dist_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e9bddf01e82f82d15d2d4bbe481009898f7c414
--- /dev/null
+++ b/paddle/fluid/operators/dist_op.cc
@@ -0,0 +1,119 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/dist_op.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+
+class DistOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Dist");
+    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "Dist");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Dist");
+    ctx->SetOutputDim("Out", {1});
+  }
+};
+
+class DistOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input Tensor of Dist Op.");
+    AddInput("Y", "The Right-hand-side input Tensor of Dist Op.");
+    AddOutput("Out",
+              "The output of Dist Op, "
+              "which is the p-norm of (X - Y)");
+    AddAttr<float>("p", "the norm to be computed.").SetDefault(2.0f);
+    AddComment(R"DOC(
+Dist Operator.
+Given two tensors X and Y whose shapes are broadcastable, this operator computes
+the p-norm of Z = X - Y. The result is not a norm in the strict mathematical
+sense; it is used only as a measure of distance.
+
+When p = 0, defining $0^0 = 0$, the zero-norm of Z is simply the number of
+non-zero elements of Z.
+$$
+||Z||_{0} = \lim_{p \rightarrow 0} \sum_{i=1}^{m} |z_i|^p
+$$
+
+When p = inf, the inf-norm of Z is the maximum absolute value of its elements.
+$$
+||Z||_\infty = \max_i |z_i|
+$$
+
+When p = -inf, the negative-inf-norm of Z is the minimum absolute value of its elements.
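+
+// The gradient op below infers dX and dY with exactly the shapes of X and Y;
+// when an input was broadcast in the forward pass, the gradient kernel in
+// dist_op.h reduces the incoming gradient back to that shape.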
+$$
+||Z||_{-\infty} = \min_i |z_i|
+$$
+
+Otherwise, the p-norm of Z follows the formula
+$$
+||Z||_{p} = (\sum_{i=1}^{m} |z_i|^p)^{1/p}
+$$
+    )DOC");
+  }
+};
+
+class DistOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    }
+    if (ctx->HasOutput(framework::GradVarName("Y"))) {
+      ctx->SetOutputDim(framework::GradVarName("Y"), y_dims);
+    }
+  }
+};
+
+template <typename T>
+class DistGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType(this->ForwardOpType() + "_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Y", this->Input("Y"));
+    op->SetInput("Out", this->Output("Out"));
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(dist, ops::DistOp, ops::DistOpMaker,
+                  ops::DistGradOpMaker<paddle::framework::OpDesc>,
+                  ops::DistGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(dist_grad, ops::DistOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    dist, ops::DistKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::DistKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    dist_grad, ops::DistGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::DistGradKernel<paddle::platform::CPUDeviceContext, double>)
diff --git a/paddle/fluid/operators/dist_op.cu b/paddle/fluid/operators/dist_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..499f5572910dd7666973bf077bf919a0378cfe52
--- /dev/null
+++ b/paddle/fluid/operators/dist_op.cu
@@ -0,0 +1,23 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/dist_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    dist, ops::DistKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DistKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    dist_grad, ops::DistGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DistGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..b35b4ce1d2787c801c8f0af46d1f7125d6ddb5ad
--- /dev/null
+++ b/paddle/fluid/operators/dist_op.h
@@ -0,0 +1,286 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using framework::Tensor;
+
+template <int Rank>
+static void GetBroadcastDims(const framework::DDim& x_dims,
+                             const framework::DDim& y_dims,
+                             Eigen::DSizes<int, Rank>* x_bcast_dims,
+                             Eigen::DSizes<int, Rank>* y_bcast_dims) {
+  int bcast_dims_remainder = 0;
+  for (int i = 0; i < x_dims.size(); ++i) {
+    if (x_dims[i] >= y_dims[i]) {
+      (*x_bcast_dims)[i] = 1;
+      (*y_bcast_dims)[i] = x_dims[i] / y_dims[i];
+      bcast_dims_remainder += x_dims[i] % y_dims[i];
+    } else {
+      (*y_bcast_dims)[i] = 1;
+      (*x_bcast_dims)[i] = y_dims[i] / x_dims[i];
+      bcast_dims_remainder += y_dims[i] % x_dims[i];
+    }
+  }
+  PADDLE_ENFORCE_EQ(bcast_dims_remainder, 0,
+                    platform::errors::PreconditionNotMet(
+                        "The input tensors of Op(dist) could not be broadcast, "
+                        "X's shape is [%s], Y's shape is [%s].",
+                        x_dims, y_dims));
+}
+
+static framework::DDim GetNewDims(const framework::DDim& in_dims, int rank) {
+  std::vector<int64_t> new_dims_vec(rank);
+  if (in_dims.size() < rank) {
+    for (int i = 0; i < rank - in_dims.size(); ++i) {
+      new_dims_vec[i] = 1;
+    }
+    for (int i = 0; i < in_dims.size(); ++i) {
+      new_dims_vec[i + rank - in_dims.size()] = in_dims[i];
+    }
+  } else {
+    new_dims_vec = vectorize(in_dims);
+  }
+  return framework::make_ddim(new_dims_vec);
+}
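+
+// For example, with Rank = 3, X of shape (2, 4, 3) and Y of shape (4, 1) are
+// first padded by GetNewDims to (2, 4, 3) and (1, 4, 1); GetBroadcastDims then
+// yields x_bcast_dims = (1, 1, 1) and y_bcast_dims = (2, 1, 3), i.e. the factor
+// by which each input has to be tiled along every axis to reach the common
+// shape (2, 4, 3).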
+
+template <typename DeviceContext, typename T, int Rank>
+static void DistFunction(const framework::ExecutionContext& context) {
+  auto* x = context.Input<Tensor>("X");
+  auto* y = context.Input<Tensor>("Y");
+  auto* out = context.Output<Tensor>("Out");
+  auto p = context.Attr<float>("p");
+  out->mutable_data<T>(context.GetPlace());
+
+  auto x_dims = context.Input<Tensor>("X")->dims();
+  auto y_dims = context.Input<Tensor>("Y")->dims();
+
+  // new dims with same size as rank, e.g. (rank=3, (4, 3) => (1, 4, 3))
+  framework::DDim x_new_dims = GetNewDims(x_dims, Rank);
+  framework::DDim y_new_dims = GetNewDims(y_dims, Rank);
+
+  auto x_t = EigenTensor<T, Rank>::From(*x, x_new_dims);
+  auto y_t = EigenTensor<T, Rank>::From(*y, y_new_dims);
+  auto out_t = EigenTensor<T, 1>::From(*out);
+  auto& place =
+      *context.template device_context<DeviceContext>().eigen_device();
+
+  Eigen::DSizes<int, Rank> x_bcast_dims;
+  Eigen::DSizes<int, Rank> y_bcast_dims;
+  GetBroadcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims);
+  // p=0 means number of non-zero elements of (x-y)
+  // p=inf means the maximum of |x-y|
+  // p=-inf means the minimum of |x-y|
+  // otherwise, Lp-norm = pow(sum(pow(|x-y|, p)), 1/p)
+  if (p == 0) {
+    out_t.device(place) =
+        (x_t.broadcast(x_bcast_dims) != y_t.broadcast(y_bcast_dims))
+            .template cast<T>()
+            .sum();
+  } else if (p == INFINITY) {
+    out_t.device(place) =
+        (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims))
+            .abs()
+            .maximum();
+  } else if (p == -INFINITY) {
+    out_t.device(place) =
+        (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims))
+            .abs()
+            .minimum();
+  } else {
+    out_t.device(place) =
+        (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims))
+            .abs()
+            .pow(p)
+            .sum()
+            .pow(1.0 / p);
+  }
+}
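+
+// Gradient of the p-norm: for out = (sum_i |z_i|^p)^(1/p) with z = x - y,
+//   d out / d z_i = (|z_i| / out)^(p - 1) * sign(z_i),
+// which is the general branch below. For p = +/-inf only the elements that
+// attain the extremum receive gradient, and for p = 0 the output is piecewise
+// constant, so the gradient is zero everywhere.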
+
+template <typename DeviceContext, typename T, int Rank>
+static void DistGradFunction(const framework::ExecutionContext& context) {
+  auto* x = context.Input<Tensor>("X");
+  auto* y = context.Input<Tensor>("Y");
+  auto* out = context.Input<Tensor>("Out");
+  auto p = context.Attr<float>("p");
+
+  auto x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+  auto y_grad = context.Output<Tensor>(framework::GradVarName("Y"));
+  auto out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
+
+  auto x_dims = context.Input<Tensor>("X")->dims();
+  auto y_dims = context.Input<Tensor>("Y")->dims();
+  auto out_dims = context.Input<Tensor>("Out")->dims();
+
+  framework::DDim x_new_dims = GetNewDims(x_dims, Rank);
+  framework::DDim y_new_dims = GetNewDims(y_dims, Rank);
+  framework::DDim out_new_dims = GetNewDims(out_dims, Rank);
+  auto x_t = EigenTensor<T, Rank>::From(*x, x_new_dims);
+  auto y_t = EigenTensor<T, Rank>::From(*y, y_new_dims);
+  auto out_t = EigenTensor<T, Rank>::From(*out, out_new_dims);
+
+  Eigen::DSizes<int, Rank> x_bcast_dims;
+  Eigen::DSizes<int, Rank> y_bcast_dims;
+  Eigen::DSizes<int, Rank> out_bcast_dims;
+
+  GetBroadcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims);
+  std::vector<int64_t> new_dims_vec(Rank);
+  for (int i = 0; i < Rank; ++i) {
+    new_dims_vec[i] = std::max(x_new_dims[i], y_new_dims[i]);
+    out_bcast_dims[i] = new_dims_vec[i];
+  }
+  framework::DDim new_dims = framework::make_ddim(new_dims_vec);
+
+  auto& place =
+      *context.template device_context<DeviceContext>().eigen_device();
+  auto out_grad_t = EigenTensor<T, Rank>::From(*out_grad, out_new_dims);
+  framework::Tensor grad;
+  grad.mutable_data<T>(new_dims, context.GetPlace());
+  auto grad_t = EigenTensor<T, Rank>::From(grad);
+
+  auto x_minus_y = x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims);
+  auto x_minus_y_abs = x_minus_y.abs();
+  auto sign =
+      (x_minus_y > static_cast<T>(0)).template cast<T>() * static_cast<T>(1.0) +
+      (x_minus_y < static_cast<T>(0)).template cast<T>() * static_cast<T>(-1.0);
+
+  // 1: Lp-norm(z), z = x-y, compute dz
+  if (p == 0) {
+    grad_t.device(place) = grad_t * static_cast<T>(0);
+  } else if (p == INFINITY || p == -INFINITY) {
+    // For p = inf or -inf, out = |z_i| of the extremal element i; dz_j is 0
+    // for j != i and sign(z_i) * dout for j = i.
+    grad_t.device(place) =
+        (x_minus_y_abs == out_t.broadcast(out_bcast_dims)).template cast<T>() *
+        sign * out_grad_t.broadcast(out_bcast_dims);
+  } else {
+    // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout
+    grad_t.device(place) =
+        (x_minus_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign *
+        out_grad_t.broadcast(out_bcast_dims);
+  }
+
+  Eigen::DSizes<int, Rank * 2> x_reshape_dims;
+  Eigen::DSizes<int, Rank * 2> y_reshape_dims;
+  Eigen::DSizes<int, Rank> reduce_dims;
+  for (int i = 0; i < x_new_dims.size(); ++i) {
+    x_reshape_dims[2 * i] = x_bcast_dims[i];
+    x_reshape_dims[2 * i + 1] = x_new_dims[i];
+    y_reshape_dims[2 * i] = y_bcast_dims[i];
+    y_reshape_dims[2 * i + 1] = y_new_dims[i];
+    reduce_dims[i] = 2 * i;
+  }
+
+  // 2: if x or y was broadcast in the forward pass, its gradient has to be
+  // summed along the broadcast dimensions
+  if (x_grad) {
+    x_grad->mutable_data<T>(context.GetPlace());
+    auto x_grad_t = EigenTensor<T, Rank>::From(*x_grad, x_new_dims);
+    x_grad_t.device(place) = grad_t.reshape(x_reshape_dims)
+                                 .sum(reduce_dims)
+                                 .reshape(x_grad_t.dimensions());
+  }
+  if (y_grad) {
+    y_grad->mutable_data<T>(context.GetPlace());
+    auto y_grad_t = EigenTensor<T, Rank>::From(*y_grad, y_new_dims);
+    y_grad_t.device(place) = -grad_t.reshape(y_reshape_dims)
+                                  .sum(reduce_dims)
+                                  .reshape(y_grad_t.dimensions());
+  }
+}
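+
+// Eigen broadcast/reshape expressions need the tensor rank at compile time, so
+// the kernels below look at the (maximum) runtime rank and dispatch to
+// DistFunction / DistGradFunction instantiated for ranks 1 through 6.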
+
+template <typename DeviceContext, typename T>
+class DistKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto x_rank = context.Input<Tensor>("X")->dims().size();
+    auto y_rank = context.Input<Tensor>("Y")->dims().size();
+    auto rank = std::max(x_rank, y_rank);
+    PADDLE_ENFORCE_LE(rank, 6,
+                      platform::errors::Unimplemented(
+                          "Op(dist) only supports tensors with no more than 6 "
+                          "dimensions, but X's rank is %d, Y's rank is %d.",
+                          x_rank, y_rank));
+    switch (rank) {
+      case 1:
+        DistFunction<DeviceContext, T, 1>(context);
+        break;
+      case 2:
+        DistFunction<DeviceContext, T, 2>(context);
+        break;
+      case 3:
+        DistFunction<DeviceContext, T, 3>(context);
+        break;
+      case 4:
+        DistFunction<DeviceContext, T, 4>(context);
+        break;
+      case 5:
+        DistFunction<DeviceContext, T, 5>(context);
+        break;
+      case 6:
+        DistFunction<DeviceContext, T, 6>(context);
+        break;
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class DistGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto x_rank = context.Input<Tensor>("X")->dims().size();
+    auto y_rank = context.Input<Tensor>("Y")->dims().size();
+    auto rank = std::max(x_rank, y_rank);
+    PADDLE_ENFORCE_LE(rank, 6,
+                      platform::errors::Unimplemented(
+                          "Op(dist) only supports tensors with no more than 6 "
+                          "dimensions, but X's rank is %d, Y's rank is %d.",
+                          x_rank, y_rank));
+    switch (rank) {
+      case 1:
+        DistGradFunction<DeviceContext, T, 1>(context);
+        break;
+      case 2:
+        DistGradFunction<DeviceContext, T, 2>(context);
+        break;
+      case 3:
+        DistGradFunction<DeviceContext, T, 3>(context);
+        break;
+      case 4:
+        DistGradFunction<DeviceContext, T, 4>(context);
+        break;
+      case 5:
+        DistGradFunction<DeviceContext, T, 5>(context);
+        break;
+      case 6:
+        DistGradFunction<DeviceContext, T, 6>(context);
+        break;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 72609882d713414d910b42a15c3706383dc7dd4d..32c4c68168da1a893706331e082f68d22f3a7674 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -152,7 +152,7 @@ from .tensor.linalg import matmul #DEFINE_ALIAS
 # from .tensor.linalg import einsum #DEFINE_ALIAS
 # from .tensor.linalg import morm #DEFINE_ALIAS
 # from .tensor.linalg import transpose #DEFINE_ALIAS
-# from .tensor.linalg import dist #DEFINE_ALIAS
+from .tensor.linalg import dist #DEFINE_ALIAS
 # from .tensor.linalg import t #DEFINE_ALIAS
 # from .tensor.linalg import cross #DEFINE_ALIAS
 # from .tensor.linalg import cholesky #DEFINE_ALIAS
diff --git a/python/paddle/fluid/tests/unittests/test_dist_op.py b/python/paddle/fluid/tests/unittests/test_dist_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f46e0e7f9ca97409a7c6ea634ed96421e593f5f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_op.py
@@ -0,0 +1,165 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+
+
+def dist(x, y, p):
+    if p == 0.:
+        out = np.count_nonzero(x - y)
+    elif p == float("inf"):
+        out = np.max(np.abs(x - y))
+    elif p == float("-inf"):
+        out = np.min(np.abs(x - y))
+    else:
+        out = np.power(np.sum(np.power(np.abs(x - y), p)), 1.0 / p)
+    return np.array(out).astype(x.dtype)
+
+
+class TestDistOp(OpTest):
+    def setUp(self):
+        self.op_type = 'dist'
+        self.attrs = {}
+        self.init_case()
+        self.inputs = {
+            "X": np.random.random(self.x_shape).astype("float64"),
+            "Y": np.random.random(self.y_shape).astype("float64")
+        }
+
+        self.attrs["p"] = self.p
+        self.outputs = {
+            "Out": dist(self.inputs["X"], self.inputs["Y"], self.attrs["p"])
+        }
+        self.gradient = self.calc_gradient()
+
+    def init_case(self):
+        self.x_shape = (120)
+        self.y_shape = (120)
+        self.p = 0.
+
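+    # Reference gradient of the p-norm w.r.t. x (the gradient w.r.t. y is its
+    # negative): d||z||_p/dz_i = (|z_i| / ||z||_p)^(p - 1) * sign(z_i). For
+    # p = +/-inf only the extremal elements receive gradient; for p = 0 the
+    # gradient is zero. Gradients of broadcast inputs are reduced over the
+    # broadcast axes.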
+    def calc_gradient(self):
+        x = self.inputs["X"]
+        y = self.inputs["Y"]
+        p = self.attrs["p"]
+        if p == 0:
+            grad = np.zeros(x.shape)
+        elif p in [float("inf"), float("-inf")]:
+            norm = dist(x, y, p)
+            x_minus_y_abs = np.abs(x - y)
+            grad = np.sign(x - y)
+            grad[x_minus_y_abs != norm] = 0
+        else:
+            norm = dist(x, y, p)
+            grad = np.power(norm, 1 - p) * np.power(np.abs(x - y),
+                                                    p - 1) * np.sign(x - y)
+
+        def get_reduce_dims(x, y):
+            x_reduce_dims = []
+            y_reduce_dims = []
+
+            if x.ndim >= y.ndim:
+                y_reshape = tuple([1] * (x.ndim - y.ndim) + list(y.shape))
+                y = y.reshape(y_reshape)
+            else:
+                x_reshape = tuple([1] * (y.ndim - x.ndim) + list(x.shape))
+                x = x.reshape(x_reshape)
+            for i in range(x.ndim):
+                if x.shape[i] > y.shape[i]:
+                    y_reduce_dims.append(i)
+                elif x.shape[i] < y.shape[i]:
+                    x_reduce_dims.append(i)
+            return x_reduce_dims, y_reduce_dims
+
+        x_reduce_dims, y_reduce_dims = get_reduce_dims(x, y)
+        if len(x_reduce_dims) != 0:
+            x_grad = np.sum(grad, tuple(x_reduce_dims)).reshape(x.shape)
+        else:
+            x_grad = grad
+        if len(y_reduce_dims) != 0:
+            y_grad = -np.sum(grad, tuple(y_reduce_dims)).reshape(y.shape)
+        else:
+            y_grad = -grad
+
+        return x_grad, y_grad
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X", "Y"], "Out", user_defined_grads=self.gradient)
+
+
+class TestDistOpCase1(TestDistOp):
+    def init_case(self):
+        self.x_shape = (3, 5, 5, 6)
+        self.y_shape = (5, 5, 6)
+        self.p = 1.
+
+
+class TestDistOpCase2(TestDistOp):
+    def init_case(self):
+        self.x_shape = (10, 10)
+        self.y_shape = (4, 10, 10)
+        self.p = 2.
+
+
+class TestDistOpCase3(TestDistOp):
+    def init_case(self):
+        self.x_shape = (15, 10)
+        self.y_shape = (15, 10)
+        self.p = float("inf")
+
+
+class TestDistOpCase4(TestDistOp):
+    def init_case(self):
+        self.x_shape = (2, 3, 4, 5, 8)
+        self.y_shape = (3, 1, 5, 8)
+        self.p = float("-inf")
+
+
+class TestDistOpCase5(TestDistOp):
+    def init_case(self):
+        self.x_shape = (4, 1, 4, 8)
+        self.y_shape = (2, 2, 1, 4, 4, 8)
+        self.p = 1.5
+
+
+class TestDistAPI(unittest.TestCase):
+    def test_api(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            x = fluid.data(name='x', shape=[2, 3, 4, 5], dtype='float64')
+            y = fluid.data(name='y', shape=[3, 1, 5], dtype='float64')
+            p = 2
+            x_i = np.random.random((2, 3, 4, 5)).astype("float64")
+            y_i = np.random.random((3, 1, 5)).astype("float64")
+            result = paddle.dist(x, y, p)
+            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+            ) else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            out = exe.run(fluid.default_main_program(),
+                          feed={'x': x_i,
+                                'y': y_i},
+                          fetch_list=[result])
+            self.assertTrue(np.allclose(dist(x_i, y_i, p), out[0]))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index fb4296f0013390b8cf03b636c1d032bf07ec8f91..40476b49a371a5881ae627d443c8678af196babb 100644
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -127,7 +127,7 @@ from .linalg import matmul #DEFINE_ALIAS
 # from .linalg import einsum #DEFINE_ALIAS
 # from .linalg import morm #DEFINE_ALIAS
 # from .linalg import transpose #DEFINE_ALIAS
-# from .linalg import dist #DEFINE_ALIAS
+from .linalg import dist #DEFINE_ALIAS
 # from .linalg import t #DEFINE_ALIAS
 # from .linalg import cross #DEFINE_ALIAS
 # from .linalg import cholesky #DEFINE_ALIAS
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 974bc0d37a0bad3b42e550c2ceb42203152ba05f..7baba355180ccecd3119999f6b2cfd91b7e350c4 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -12,6 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from paddle.common_ops_import import *
+from ..fluid.layer_helper import LayerHelper
+from ..fluid.data_feeder import check_variable_and_dtype, check_type
+from ..fluid.framework import in_dygraph_mode
 
 # TODO: define functions of linear algebra
 __all__ = [
@@ -20,7 +23,7 @@ __all__ = [
     # 'einsum',
     # 'morm',
     # 'transpose',
-    # 'dist',
+    'dist',
     # 't',
     # 'cross',
     # 'cholesky',
@@ -156,3 +159,78 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
         outputs={'Out': out},
         attrs=attrs)
     return out
+
+
+def dist(x, y, p=2):
+    """
+    This OP returns the p-norm of z = x - y. The result is not a norm in the
+    strict mathematical sense; it is used only as a measure of distance. The
+    shapes of x and y must be broadcastable.
+
+    When p = 0, defining $0^0 = 0$, the zero-norm of z is simply the number of
+    non-zero elements of z.
+
+    .. math::
+
+        ||z||_{0} = \lim_{p \\rightarrow 0} \sum_{i=1}^{m} |z_i|^{p}
+
+    When p = inf, the inf-norm of z is the maximum absolute value of its elements.
+
+    .. math::
+
+        ||z||_\infty = \max_i |z_i|
+
+    When p = -inf, the negative-inf-norm of z is the minimum absolute value of its elements.
+
+    .. math::
+
+        ||z||_{-\infty} = \min_i |z_i|
+
+    Otherwise, the p-norm of z follows the formula
+
+    .. math::
+
+        ||z||_{p} = (\sum_{i=1}^{m} |z_i|^p)^{\\frac{1}{p}}
+
+    Args:
+        x (Variable): 1-D to 6-D Tensor, its data type is float32 or float64.
+        y (Variable): 1-D to 6-D Tensor, its data type is float32 or float64.
+        p (float, optional): The order of the norm to be computed. Default: 2.
+
+    Returns:
+        Variable: Tensor that is the p-norm of (x - y).
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            import numpy as np
+
+            with fluid.dygraph.guard():
+                x = fluid.dygraph.to_variable(np.array([[3, 3], [3, 3]]).astype(np.float32))
+                y = fluid.dygraph.to_variable(np.array([[3, 3], [3, 1]]).astype(np.float32))
+                out = paddle.dist(x, y, 0)
+                print(out.numpy())  # out = [1.]
+
+                out = paddle.dist(x, y, 2)
+                print(out.numpy())  # out = [2.]
+
+                out = paddle.dist(x, y, float("inf"))
+                print(out.numpy())  # out = [2.]
+
+                out = paddle.dist(x, y, float("-inf"))
+                print(out.numpy())  # out = [0.]
+    """
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'dist')
+    check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'dist')
+    check_type(p, 'p', (float, int), 'dist')
+    helper = LayerHelper("dist", **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+
+    inputs = {"X": [x], "Y": [y]}
+    outputs = {'Out': [out]}
+    attrs = {"p": float(p)}
+    helper.append_op(
+        type='dist', inputs=inputs, outputs=outputs, attrs=attrs)
+    return out
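
For reviewers who want to exercise the new Python API outside of the unit test, a minimal dygraph check against the same NumPy reference used in test_dist_op.py could look like the sketch below. It is only an illustration (the helper np_dist and the chosen shapes are ad hoc, not part of this patch) and assumes a build of this branch with the dist op registered:

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    def np_dist(x, y, p):
        # NumPy reference, mirroring dist() in test_dist_op.py
        z = np.abs(x - y)
        if p == 0.:
            return float(np.count_nonzero(x - y))
        if p == float("inf"):
            return float(np.max(z))
        if p == float("-inf"):
            return float(np.min(z))
        return float(np.power(np.sum(np.power(z, p)), 1.0 / p))

    with fluid.dygraph.guard():
        x_np = np.random.random((3, 4, 5)).astype("float32")
        y_np = np.random.random((4, 1)).astype("float32")  # broadcastable to x
        x = fluid.dygraph.to_variable(x_np)
        y = fluid.dygraph.to_variable(y_np)
        for p in (0., 1., 2., float("inf"), float("-inf")):
            out = paddle.dist(x, y, p)
            assert np.allclose(out.numpy(), np_dist(x_np, y_np, p))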