diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 78e86c12cb4bbb10de52cc2aa46a7d0ff6ce7cd3..5722993aec07fa3b9cf2bffc72203f7d8a6e306f 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -21,7 +21,7 @@ add_subdirectory(ops)
 add_subdirectory(tests)
 
 # make an unity target for compile deps
-set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos)
+set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor)
 get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
 # keep this message for debug, remove it later if needless
 message(STATUS "All standard pten kernels: ${pten_kernels}")
diff --git a/paddle/pten/common/layout.h b/paddle/pten/common/layout.h
index cfcc4f76693d9b87b89de63bb1418c3a40b12a0d..57aa8863cb4d3a3f225d0ac7e98ffbaa8fc18953 100644
--- a/paddle/pten/common/layout.h
+++ b/paddle/pten/common/layout.h
@@ -27,6 +27,8 @@ enum class DataLayout {
   NHWC,
   NCHW,
   MKLDNN,
+  SPARSE_COO,
+  SPARSE_CSR,
   NUM_DATA_LAYOUTS,
   // See Note [ Why we need ALL in basic kernel key member? ]
   ALL_LAYOUT = UNDEFINED,
@@ -64,6 +66,10 @@ inline DataLayout StringToDataLayout(const std::string& str) {
     return DataLayout::kAnyLayout;
   } else if (s == "MKLDNNLAYOUT") {
     return DataLayout::kMKLDNN;
+  } else if (s == "SPARSE_COO") {
+    return DataLayout::SPARSE_COO;
+  } else if (s == "SPARSE_CSR") {
+    return DataLayout::SPARSE_CSR;
   } else {
     PD_THROW("Unknown data layout type string: ", s, ".");
   }
@@ -79,6 +85,10 @@ inline std::string DataLayoutToString(const DataLayout& layout) {
       return "Undefined(AnyLayout)";
     case DataLayout::kMKLDNN:
       return "MKLDNN";
+    case DataLayout::SPARSE_COO:
+      return "SPARSE_COO";
+    case DataLayout::SPARSE_CSR:
+      return "SPARSE_CSR";
     default:
       PD_THROW("Unknown Data Layout type ", static_cast<int>(layout), ".");
   }
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
index f83b80fca1f480e13dfbe89c34f68cb4f4249cf9..ab6a99319732d4c07f6224b181d05e7efd696aba 100644
--- a/paddle/pten/core/CMakeLists.txt
+++ b/paddle/pten/core/CMakeLists.txt
@@ -22,6 +22,8 @@ cc_library(kernel_context SRCS kernel_context.cc DEPS pten_enforce pten_context)
 cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS pten_enforce)
 cc_library(tensor_meta SRCS tensor_meta.cc DEPS pten_enforce mixed_vector)
 cc_library(lod_utils SRCS lod_utils.cc DEPS pten_enforce mixed_vector)
+# Sparse tensor containers. Both hold DenseTensor members and call into
+# DenseTensor (copy, dims, Resize), so both must link against dense_tensor.
+cc_library(sparse_coo_tensor SRCS sparse_coo_tensor.cc DEPS dense_tensor tensor_meta tensor_base)
+cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_base)
 cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS convert_utils tensor_meta tensor_base)
 cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base )
 
diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h
index 85fe2f22836e61bf7348fa0bbe36c9efb2b02331..d48572db5a22848f22d172c26f4d79e8c9c820dc 100644
--- a/paddle/pten/core/kernel_utils.h
+++ b/paddle/pten/core/kernel_utils.h
@@ -20,6 +20,8 @@
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_context.h"
 #include "paddle/pten/core/kernel_def.h"
+#include "paddle/pten/core/sparse_coo_tensor.h"
+#include "paddle/pten/core/sparse_csr_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/pten/core/enforce.h"
@@ -213,6 +215,14 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
   PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor);
   PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor);
   PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor);
+
+  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor);
+  PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor);
+  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SparseCooTensor);
+
+  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCsrTensor);
+  PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCsrTensor);
+  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SparseCsrTensor);
   // TODO(chenweihang): adapt SelectedRows
   // PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRowsTensor);
 
@@ -234,6 +244,12 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
 
   PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor);
   PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor);
+
+  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor);
+  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor);
+
+  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCsrTensor);
+  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCsrTensor);
   // TODO(chenweihang): adapt SelectedRows
   // PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRowsTensor);
 
diff --git a/paddle/pten/core/sparse_coo_tensor.cc b/paddle/pten/core/sparse_coo_tensor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0966fe1d80ff9975db073433c78d9f356dba5877
--- /dev/null
+++ b/paddle/pten/core/sparse_coo_tensor.cc
@@ -0,0 +1,107 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/sparse_coo_tensor.h"
+
+#include <utility>
+
+namespace pten {
+
+// Builds a COO tensor from existing index/value tensors.
+// Both DenseTensor arguments are copy-constructed into the members and `dims`
+// is stored as the shape of the original dense tensor. coalesced_ is left at
+// its in-class default (false).
+SparseCooTensor::SparseCooTensor(const DenseTensor& non_zero_indices,
+                                 const DenseTensor& non_zero_elements,
+                                 const DDim& dims)
+    : non_zero_indices_(non_zero_indices),
+      non_zero_elements_(non_zero_elements),
+      dims_(dims) {}
+
+// Builds a COO tensor by taking over the given index/value tensors.
+// A named rvalue-reference parameter is an lvalue inside the function, so the
+// arguments must be passed through std::move to actually select DenseTensor's
+// move constructor; without it the members are copy-constructed.
+// `dims` is the shape of the original dense tensor; coalesced_ starts false.
+SparseCooTensor::SparseCooTensor(DenseTensor&& non_zero_indices,
+                                 DenseTensor&& non_zero_elements,
+                                 const DDim& dims)
+    : non_zero_indices_(std::move(non_zero_indices)),
+      non_zero_elements_(std::move(non_zero_elements)),
+      coalesced_(false),
+      dims_(dims) {}
+
+// Copy constructor: copy-constructs every member from `other`
+// (index tensor, value tensor, coalesced flag and dense shape).
+// Members are listed in their declaration order.
+SparseCooTensor::SparseCooTensor(const SparseCooTensor& other)
+    : non_zero_indices_(other.non_zero_indices_),
+      non_zero_elements_(other.non_zero_elements_),
+      coalesced_(other.coalesced_),
+      dims_(other.dims_) {}
+
+// Copy assignment: overwrites every member of this tensor with the
+// corresponding member of `other`.
+// NOTE(review): the result is returned by value (matching the declaration in
+// the header), so each assignment also copy-constructs a temporary; returning
+// SparseCooTensor& would need the header changed in the same commit.
+SparseCooTensor SparseCooTensor::operator=(const SparseCooTensor& other) {
+  non_zero_indices_ = other.non_zero_indices_;
+  non_zero_elements_ = other.non_zero_elements_;
+  coalesced_ = other.coalesced_;
+  dims_ = other.dims_;
+  return *this;
+}
+
+// Returns the number of stored non-zero entries, derived from the shape of
+// non_zero_indices_:
+//   rank 0  -> 0
+//   rank 1  -> extent of the only axis
+//   rank 2+ -> extent of axis 1 (Resize lays indices out as
+//              {sparse_dim, non_zero_num})
+int64_t SparseCooTensor::nnz() const {
+  const auto& idx_dims = non_zero_indices_.dims();
+  switch (idx_dims.size()) {
+    case 0:
+      return 0;
+    case 1:
+      return idx_dims[0];
+    default:
+      return idx_dims[1];
+  }
+}
+
+// Resizes the index and value tensors so they can hold `non_zero_num` entries.
+//   dense_dims   shape of the original dense tensor
+//   sparse_dim   number of leading dimensions addressed through indices;
+//                must satisfy 1 <= sparse_dim <= dense_dims.size()
+//   non_zero_num new entry count; must not be smaller than the current nnz()
+// Resulting shapes:
+//   non_zero_indices_  -> {sparse_dim, non_zero_num}
+//   non_zero_elements_ -> {non_zero_num, <remaining dense dimensions...>}
+// Raises InvalidArgument when any of the constraints above is violated.
+// NOTE(review): dims_ is not updated here; confirm callers set it separately.
+void SparseCooTensor::Resize(const DDim& dense_dims,
+                             const int64_t sparse_dim,
+                             const int64_t non_zero_num) {
+  PADDLE_ENFORCE_GE(non_zero_num,
+                    this->nnz(),
+                    paddle::platform::errors::InvalidArgument(
+                        "the non_zero_num must be greater than or equal to the "
+                        "origin non_zero_num."));
+  PADDLE_ENFORCE_GE(sparse_dim,
+                    1,
+                    paddle::platform::errors::InvalidArgument(
+                        "the sparse_dim must be greater than or equal 1."));
+  PADDLE_ENFORCE_LE(
+      sparse_dim,
+      dense_dims.size(),
+      paddle::platform::errors::InvalidArgument(
+          "the sparse_dim must be less than or equal dense_dims."));
+
+  // Number of trailing dimensions stored densely inside each value entry.
+  const auto num_dense_dims = dense_dims.size() - sparse_dim;
+
+  DDim values_dims;
+  if (num_dense_dims == 0) {
+    // Every stored value is a scalar.
+    values_dims = pten::framework::make_ddim({non_zero_num});
+  } else {
+    // Every stored value is itself a tensor: prepend the entry count to the
+    // trailing dense dimensions.
+    std::vector<int64_t> values_shape(num_dense_dims + 1);
+    values_shape[0] = non_zero_num;
+    memcpy(&values_shape[1],
+           dense_dims.Get() + sparse_dim,
+           num_dense_dims * sizeof(dense_dims[0]));
+    values_dims = pten::framework::make_ddim(values_shape);
+  }
+
+  non_zero_indices_.Resize(
+      pten::framework::make_ddim({sparse_dim, non_zero_num}));
+  non_zero_elements_.Resize(values_dims);
+}
+
+// Replaces all members of the tensor in one call.
+//   non_zero_indices  indices of the non-zero elements in the dense tensor
+//   non_zero_elements values of the non-zero elements
+//   dims              shape of the original dense tensor
+//   coalesced         whether the indices have been coalesced
+void SparseCooTensor::SetMember(const DenseTensor& non_zero_indices,
+                                const DenseTensor& non_zero_elements,
+                                const DDim& dims,
+                                const bool coalesced) {
+  this->non_zero_indices_ = non_zero_indices;
+  this->non_zero_elements_ = non_zero_elements;
+  // Take the shape from the `dims` argument; assigning dims_ to itself would
+  // silently keep the previous shape.
+  this->dims_ = dims;
+  this->coalesced_ = coalesced;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/core/sparse_coo_tensor.h b/paddle/pten/core/sparse_coo_tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff4241d3287ec1483f31bf8c9e1cf44fa1d88a1e
--- /dev/null
+++ b/paddle/pten/core/sparse_coo_tensor.h
@@ -0,0 +1,180 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/tensor_base.h"
+#include "paddle/pten/core/tensor_meta.h"
+
+namespace pten {
+
+/// \brief The SparseCooTensor uses two DenseTensors to represent
+/// the non zero elements and the indices of non zero elements of
+/// original DenseTensor.
+/// where non_zero_elements_ represents the non zero elements of original
+/// DenseTensor.
+/// non_zero_indices_ represents the indices of non zero elements in original
+/// DenseTensor.
+class SparseCooTensor : public TensorBase,
+                        public TypeInfoTraits<TensorBase, SparseCooTensor> {
+ public:
+  /// \brief Create the sparse coo tensor
+  /// \param non_zero_indices The indices of non zero elements in original dense
+  /// tensor.
+  /// \param non_zero_elements The non zero elements of original dense tensor.
+  /// \param dims The dims of original dense tensor.
+  SparseCooTensor(const DenseTensor& non_zero_indices,
+                  const DenseTensor& non_zero_elements,
+                  const DDim& dims);
+
+  /// \brief Create the sparse coo tensor from temporaries.
+  /// \param non_zero_indices The indices of non zero elements in original dense
+  /// tensor.
+  /// \param non_zero_elements The non zero elements of original dense tensor.
+  /// \param dims The dims of original dense tensor.
+  SparseCooTensor(DenseTensor&& non_zero_indices,
+                  DenseTensor&& non_zero_elements,
+                  const DDim& dims);
+
+  /// \brief SparseCooTensor shallow copy constructor.
+  SparseCooTensor(const SparseCooTensor& other);
+
+  /// \brief Move constructor. Defaulted here (member-wise move), in the same
+  /// way as SparseCsrTensor, because no out-of-line definition exists in
+  /// sparse_coo_tensor.cc; a declaration without a definition would fail at
+  /// link time as soon as the constructor is used.
+  SparseCooTensor(SparseCooTensor&& other) = default;
+
+  /// \brief SparseCooTensor shallow copy assignment.
+  SparseCooTensor operator=(const SparseCooTensor& other);
+
+  /// \brief Destroy the tensor object and release exclusive resources.
+  virtual ~SparseCooTensor() = default;
+
+  /// \brief Returns the indices of non zero elements in original dense tensor.
+  /// \return The indices of non zero elements in original dense tensor.
+  const DenseTensor& non_zero_indices() const { return non_zero_indices_; }
+
+  /// \brief Returns the non zero elements in original dense tensor.
+  /// \return The non zero elements in original dense tensor.
+  const DenseTensor& non_zero_elements() const { return non_zero_elements_; }
+
+  /// \brief Returns whether the indices have been coalesced
+  /// \return whether the indices have been coalesced
+  bool coalesced() const { return coalesced_; }
+
+  /// \brief Set the coalesced flag
+  /// \param coalesced whether the indices have been coalesced
+  void SetCoalesced(const bool coalesced) { coalesced_ = coalesced; }
+
+  /// \brief Returns the name of the class for type traits.
+  /// \return The name of the class.
+  static const char* name() { return "SparseCooTensor"; }
+
+  /// \brief Returns the total number of non zero elements in original
+  /// DenseTensor
+  int64_t nnz() const;
+
+  /// \brief Return the number of elements contained in original dense tensor
+  /// \return The number of elements contained in original dense tensor
+  int64_t numel() const { return product(dims_); }
+
+  /// \brief Returns the dims of the original dense tensor.
+  /// \return The dims of the original dense tensor.
+  const DDim& dims() const noexcept override { return dims_; }
+
+  /// \brief Returns the data type of the tensor.
+  /// \return The data type of the tensor.
+  DataType dtype() const noexcept override {
+    return non_zero_elements_.dtype();
+  }
+
+  /// \brief Returns the data layout of the tensor.
+  /// \return The data layout of the tensor (always SPARSE_COO).
+  DataLayout layout() const { return DataLayout::SPARSE_COO; }
+
+  /// \brief Returns the data place of the tensor.
+  /// \return The data place of the tensor.
+  const Place& place() const override { return non_zero_elements_.place(); }
+
+  /// \brief Test whether the non_zero_elements_ metadata is valid.
+  /// \return Whether the non_zero_elements_ metadata is valid.
+  bool valid() const noexcept { return non_zero_elements_.valid(); }
+
+  /// \brief Test whether the non_zero_elements_ storage is allocated.
+  /// \return Whether the non_zero_elements_ storage is allocated.
+  bool initialized() const override { return non_zero_elements_.initialized(); }
+
+  /// \brief resize sparse coo tensor.
+  /// \param dense_dims The dims of original dense tensor.
+  /// \param sparse_dim number of sparse dimensions
+  /// \param non_zero_num The total number of non zero element
+  void Resize(const DDim& dense_dims,
+              const int64_t sparse_dim,
+              const int64_t non_zero_num);
+
+  /// \brief set the member of sparse coo tensor.
+  /// \param non_zero_indices The indices of non zero elements in original dense
+  /// tensor.
+  /// \param non_zero_elements The non zero elements of original dense tensor.
+  /// \param dims The dims of original dense tensor.
+  /// \param coalesced whether the indices have been coalesced.
+  void SetMember(const DenseTensor& non_zero_indices,
+                 const DenseTensor& non_zero_elements,
+                 const DDim& dims,
+                 const bool coalesced = false);
+
+  /// \brief Get a mutable pointer of non_zero_indices_.
+  /// \return a mutable pointer of non_zero_indices_.
+  DenseTensor* mutable_non_zero_indices() { return &non_zero_indices_; }
+
+  /// \brief Get a mutable pointer of non_zero_elements.
+  /// \return a mutable pointer of non_zero_elements.
+  DenseTensor* mutable_non_zero_elements() { return &non_zero_elements_; }
+
+ private:
+  // save the indices of non zero elements in original dense tensor
+  DenseTensor non_zero_indices_;
+  // save the non zero elements of original dense tensor
+  DenseTensor non_zero_elements_;
+  /// whether the indices have been coalesced
+  bool coalesced_ = false;
+  // save the dims of the original dense tensor
+  DDim dims_;
+  /* --------------------------- */
+  /*   example: non zero element is scalar */
+  /* --------------------------- */
+  /*
+     dense_x = [[0, 1, 0, 0],
+                [2, 0, 0, 3],
+                [0, 0, 4, 0],
+                [0, 5, 0, 6]]
+     dims_ = (4, 4)
+     non_zero_elements_ = [1, 2, 3, 4, 5 ,6]
+     non_zero_indices_ = [[0, 1, 1, 2, 3, 3],
+                          [1, 0, 3, 2, 1, 3]]
+   */
+  /* --------------------------- */
+  /*   example: non zero element is tensor */
+  /* --------------------------- */
+  /*
+     dense_x = [[0, 1, 0, 0],
+                [0, 0, 0, 0],
+                [0, 0, 4, 0],
+                [0, 0, 0, 0]]
+     dims_ = (4, 4)
+     non_zero_elements_ = [[0, 1, 0, 0], [0, 0, 4, 0]]
+     non_zero_indices_ = [0, 2]
+   */
+};
+
+}  // namespace pten
diff --git a/paddle/pten/core/sparse_csr_tensor.cc b/paddle/pten/core/sparse_csr_tensor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9cb9163cb6f6742412b99daac57154ccf6504869
--- /dev/null
+++ b/paddle/pten/core/sparse_csr_tensor.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/sparse_csr_tensor.h"
+
+namespace pten {
+
+// Validates the rank of a dense shape intended for a SparseCsrTensor.
+// Rank 2 (a single matrix) and rank 3 (a batch of matrices) are accepted;
+// any other rank raises InvalidArgument. The message names both accepted
+// ranks so that it matches the condition actually being checked.
+inline void check_shape(const DDim& dims) {
+  bool valid = dims.size() == 2 || dims.size() == 3;
+
+  PADDLE_ENFORCE(valid,
+                 paddle::platform::errors::InvalidArgument(
+                     "the SparseCsrTensor only support 2-D or 3-D Tensor."));
+}
+// Validates the arguments used to build or reset a SparseCsrTensor:
+//   1. `dims` must pass check_shape() (rank 2 or rank 3);
+//   2. crows, cols and elements must all live on the same place.
+// The former extra "dims.size() == 2" enforcement was dropped: it rejected the
+// rank-3 (batched) shapes that check_shape() and Resize() explicitly support.
+// Comments are kept outside the macro body because a `//` comment on a
+// continued line would swallow the following line.
+#define Check(non_zero_crows, non_zero_cols, non_zero_elements, dims)          \
+  {                                                                            \
+    check_shape(dims);                                                         \
+    PADDLE_ENFORCE_EQ(                                                         \
+        non_zero_cols.place(),                                                 \
+        non_zero_crows.place(),                                                \
+        paddle::platform::errors::InvalidArgument(                             \
+            "non_zero_crows and non_zero_cols must have the same place."));    \
+    PADDLE_ENFORCE_EQ(                                                         \
+        non_zero_cols.place(),                                                 \
+        non_zero_elements.place(),                                             \
+        paddle::platform::errors::InvalidArgument(                             \
+            "non_zero_cols and non_zero_elements must have the same place.")); \
+  }
+
+// Builds a CSR tensor from its three component tensors and the shape of the
+// original dense tensor. The arguments are copy-constructed into the members
+// first; the Check macro then validates the stored shape and verifies that the
+// three component tensors share one place, raising InvalidArgument otherwise.
+SparseCsrTensor::SparseCsrTensor(const DenseTensor& non_zero_crows,
+                                 const DenseTensor& non_zero_cols,
+                                 const DenseTensor& non_zero_elements,
+                                 const DDim& dims)
+    : non_zero_crows_(non_zero_crows),
+      non_zero_cols_(non_zero_cols),
+      non_zero_elements_(non_zero_elements),
+      dims_(dims) {
+  Check(non_zero_crows_, non_zero_cols_, non_zero_elements_, dims_);
+}
+
+// Copy constructor: copy-constructs the crows, cols, elements tensors and the
+// dense shape from `other`. No validation is repeated here.
+SparseCsrTensor::SparseCsrTensor(const SparseCsrTensor& other)
+    : non_zero_crows_(other.non_zero_crows_),
+      non_zero_cols_(other.non_zero_cols_),
+      non_zero_elements_(other.non_zero_elements_),
+      dims_(other.dims_) {}
+
+// Copy assignment: overwrites the crows, cols, elements tensors and the dense
+// shape of this tensor with those of `other`, then returns *this.
+SparseCsrTensor& SparseCsrTensor::operator=(const SparseCsrTensor& other) {
+  non_zero_crows_ = other.non_zero_crows_;
+  non_zero_cols_ = other.non_zero_cols_;
+  non_zero_elements_ = other.non_zero_elements_;
+  dims_ = other.dims_;
+  return *this;
+}
+
+// Resizes the three component tensors for a dense shape `dense_dims` holding
+// `non_zero_num` non-zero elements.
+//   - The tensor must already be initialized, and `dense_dims` must pass
+//     check_shape(); otherwise InvalidArgument is raised.
+//   - non_zero_crows_ gets dense_dims[0] + 1 entries for a rank-2 shape, and
+//     dense_dims[0] * (dense_dims[1] + 1) entries for a rank-3 shape, where
+//     dense_dims[0] is the batch size.
+//   - non_zero_cols_ and non_zero_elements_ both get `non_zero_num` entries.
+// NOTE(review): dims_ is not updated here; confirm callers set it separately.
+void SparseCsrTensor::Resize(const DDim& dense_dims,
+                             const int64_t non_zero_num) {
+  PADDLE_ENFORCE(this->initialized(),
+                 paddle::platform::errors::InvalidArgument(
+                     "the SparseCsrTensor must be initialized when call Resize "
+                     "function."));
+  check_shape(dense_dims);
+
+  const bool batched = dense_dims.size() == 3;
+  const int64_t crows_size =
+      batched ? dense_dims[0] * (dense_dims[1] + 1) : dense_dims[0] + 1;
+
+  const DDim nnz_dims = pten::framework::make_ddim({non_zero_num});
+  non_zero_crows_.Resize(pten::framework::make_ddim({crows_size}));
+  non_zero_cols_.Resize(nnz_dims);
+  non_zero_elements_.Resize(nnz_dims);
+}
+
+// Replaces all members of the tensor in one call.
+// The arguments are validated with the Check macro before anything is
+// modified, so a failed validation leaves the tensor unchanged.
+//   non_zero_crows    compressed row index of the non-zero elements
+//   non_zero_cols     column index of the non-zero elements
+//   non_zero_elements values of the non-zero elements
+//   dims              shape of the original dense tensor
+void SparseCsrTensor::SetMember(const DenseTensor& non_zero_crows,
+                                const DenseTensor& non_zero_cols,
+                                const DenseTensor& non_zero_elements,
+                                const DDim& dims) {
+  Check(non_zero_crows, non_zero_cols, non_zero_elements, dims);
+
+  dims_ = dims;
+  non_zero_crows_ = non_zero_crows;
+  non_zero_cols_ = non_zero_cols;
+  non_zero_elements_ = non_zero_elements;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/core/sparse_csr_tensor.h b/paddle/pten/core/sparse_csr_tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a3322ba17c84d34704e8267d87c68a1954e7ae9
--- /dev/null
+++ b/paddle/pten/core/sparse_csr_tensor.h
@@ -0,0 +1,184 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/tensor_base.h"
+#include "paddle/pten/core/tensor_meta.h"
+
+namespace pten {
+
+class CompatibleDenseTensorUtils;
+
+/// \brief The SparseCsrTensor uses three 1-D DenseTensors to represent
+/// the row index , column index and non zero elements of the original
+/// DenseTensor.
+/// where non_zero_crows_ represents the compressed row index,
+/// non_zero_cols_ represents the column index of non zero elements in original
+/// DenseTensor,
+/// non_zero_elements_ represents the non zero elements of original DenseTensor.
+class SparseCsrTensor : public TensorBase,
+                        public TypeInfoTraits<TensorBase, SparseCsrTensor> {
+ public:
+  /// \brief Because sparse csr tensor is a resource handle, we provide a
+  /// default move constructor to support move semantics.
+  SparseCsrTensor(SparseCsrTensor&& other) = default;
+
+  /// \brief SparseCsrTensor shallow copy constructor.
+  SparseCsrTensor(const SparseCsrTensor& other);
+
+  /// \brief create the sparse csr tensor.
+  /// \param non_zero_crows The compressed row index of non zero elements in
+  /// original dense tensor.
+  /// \param non_zero_cols The column index of non zero elements in original
+  /// dense tensor.
+  /// \param non_zero_elements The non zero elements of original dense tensor.
+  /// \param dims The dims of original dense tensor.
+  SparseCsrTensor(const DenseTensor& non_zero_crows,
+                  const DenseTensor& non_zero_cols,
+                  const DenseTensor& non_zero_elements,
+                  const DDim& dims);
+
+  /// \brief SparseCsrTensor shallow copy assignment.
+  SparseCsrTensor& operator=(const SparseCsrTensor& other);
+
+  /// \brief Destroy the tensor object and release exclusive resources.
+  virtual ~SparseCsrTensor() = default;
+
+ public:
+  /// \brief Returns the name of the class for type traits.
+  /// \return The name of the class.
+  static const char* name() { return "SparseCsrTensor"; }
+
+  /// \brief Returns the compressed row index of non zero elements in original
+  /// dense tensor.
+  /// \return The compressed row index of non zero elements in original dense
+  /// tensor.
+  const DenseTensor& non_zero_crows() const { return non_zero_crows_; }
+
+  /// \brief Returns the column index of non zero elements in original dense
+  /// tensor.
+  /// \return The column index of non zero elements in original dense tensor.
+  const DenseTensor& non_zero_cols() const { return non_zero_cols_; }
+
+  /// \brief Returns the non zero elements in original dense tensor.
+  /// \return The non zero elements in original dense tensor.
+  const DenseTensor& non_zero_elements() const { return non_zero_elements_; }
+
+  /// \brief Return the number of elements contained in original dense tensor
+  /// \return The number of elements contained in original dense tensor
+  int64_t numel() const { return product(dims_); }
+
+  /// \brief Returns the dims of the original dense tensor.
+  /// \return The dims of the original dense tensor.
+  const DDim& dims() const noexcept override { return dims_; }
+
+  /// \brief Returns the data type of the tensor.
+  /// \return The data type of the tensor.
+  DataType dtype() const noexcept override {
+    return non_zero_elements_.dtype();
+  }
+
+  /// \brief Returns the data layout of the tensor.
+  /// \return The data layout of the tensor (always SPARSE_CSR).
+  DataLayout layout() const { return DataLayout::SPARSE_CSR; }
+
+  /// \brief Returns the data place of the tensor.
+  /// \return The data place of the tensor.
+  const Place& place() const override { return non_zero_elements_.place(); }
+
+  /// \brief Test whether the non_zero_elements_ metadata is valid.
+  /// \return Whether the non_zero_elements_ metadata is valid.
+  bool valid() const noexcept { return non_zero_elements_.valid(); }
+
+  /// \brief Test whether the non_zero_elements_ storage is allocated.
+  /// \return Whether the non_zero_elements_ storage is allocated.
+  bool initialized() const override { return non_zero_elements_.initialized(); }
+
+  /// \brief resize sparse csr tensor.
+  /// \param dense_dims The dims of original dense tensor.
+  /// \param non_zero_num The total number of non zero element
+  void Resize(const DDim& dense_dims, const int64_t non_zero_num);
+
+  /// \brief set the member of sparse csr tensor.
+  /// \param non_zero_crows The compressed row index of non zero elements in
+  /// original dense tensor.
+  /// \param non_zero_cols The column index of non zero elements in original
+  /// dense tensor.
+  /// \param non_zero_elements The non zero elements of original dense tensor.
+  /// \param dims The dims of original dense tensor.
+  void SetMember(const DenseTensor& non_zero_crows,
+                 const DenseTensor& non_zero_cols,
+                 const DenseTensor& non_zero_elements,
+                 const DDim& dims);
+
+  /// \brief Get a mutable pointer of non_zero_crows.
+  /// \return a mutable pointer of non_zero_crows.
+  DenseTensor* mutable_non_zero_crows() { return &non_zero_crows_; }
+
+  /// \brief Get a mutable pointer of non_zero_cols.
+  /// \return a mutable pointer of non_zero_cols.
+  DenseTensor* mutable_non_zero_cols() { return &non_zero_cols_; }
+
+  /// \brief Get a mutable pointer of non_zero_elements.
+  /// \return a mutable pointer of non_zero_elements.
+  DenseTensor* mutable_non_zero_elements() { return &non_zero_elements_; }
+
+ private:
+  // save the compressed rows information of non zero elements
+  DenseTensor non_zero_crows_;
+  // save the columns information of non zero elements
+  DenseTensor non_zero_cols_;
+  // save the non zero elements
+  DenseTensor non_zero_elements_;
+  // save the dims of the original dense tensor
+  DDim dims_;
+  /* --------------------------- */
+  /*   example: 2-D Tensor */
+  /* --------------------------- */
+  /*
+     x = [[0, 1, 0, 0],
+          [2, 0, 0, 3],
+          [0, 0, 4, 0],
+          [0, 5, 0, 6]]
+     dims_ = (4, 4)
+     non_zero_elements_ = [1, 2, 3, 4, 5 ,6]
+     non_zero_crows_ = [0, 1, 3, 4, 6]
+     non_zero_cols_ = [1, 0, 3, 2, 1, 3]
+   */
+
+  /* --------------------------- */
+  /*   example: 3-D Tensor */
+  /*   the non zero elements of different batch will be concat together */
+  /*   each batch contributes its own (rows + 1) crows entries, restarting */
+  /*   from 0 */
+  /* --------------------------- */
+  /*
+     x = [[[0, 1, 0, 0],
+          [2, 0, 0, 3],
+          [0, 0, 4, 0],
+          [0, 5, 0, 6]],
+         [[0, 1, 0, 0],
+          [2, 0, 0, 3],
+          [0, 0, 4, 0],
+          [0, 5, 0, 0]]]
+     dims_ = (2, 4, 4)
+     non_zero_elements_ = [1, 2, 3, 4, 5 ,6, 1, 2, 3, 4, 5]
+     non_zero_crows_ = [0, 1, 3, 4, 6, 0, 1, 3, 4, 5]
+     non_zero_cols_ = [1, 0, 3, 2, 1, 3, 1, 0, 3, 2, 1]
+   */
+};
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/copy_kernel.h b/paddle/pten/kernels/copy_kernel.h
index a481908892e9b7d03f5b0e87051122218532dcef..b2ca18e4d28a3fb36e70d85523640f53be12c109 100644
--- a/paddle/pten/kernels/copy_kernel.h
+++ b/paddle/pten/kernels/copy_kernel.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/sparse_csr_tensor.h"
 
 namespace pten {
 
@@ -24,4 +25,10 @@ void Copy(const Context& dev_ctx,
           bool blocking,
           DenseTensor* dst);
 
+/// \brief Copy the component tensors of a SparseCsrTensor into another one.
+/// \param dev_ctx The device context used for the copy.
+/// \param src The source sparse csr tensor.
+/// \param blocking Forwarded to Copy for each component tensor.
+/// \param dst The destination sparse csr tensor; its crows, cols and non zero
+/// elements tensors are written through the mutable_* accessors.
+template <typename Context>
+void CopySparse(const Context& dev_ctx,
+                const SparseCsrTensor& src,
+                bool blocking,
+                SparseCsrTensor* dst);
+
 }  // namespace pten
diff --git a/paddle/pten/kernels/gpu/copy_kernel.cu b/paddle/pten/kernels/gpu/copy_kernel.cu
index d2578723158317f485f0e93a1a5b93477db5df04..f540f96ab257dae445dced001ef10723d853bfaf 100644
--- a/paddle/pten/kernels/gpu/copy_kernel.cu
+++ b/paddle/pten/kernels/gpu/copy_kernel.cu
@@ -215,7 +215,25 @@ void Copy(const Context& dev_ctx,
   }
 }
 
+// Copies a SparseCsrTensor by running the dense Copy kernel on each of its
+// three component tensors (crows, cols, non-zero elements), forwarding
+// `dev_ctx` and `blocking` unchanged.
+// NOTE(review): only the component tensors are copied; dst's dims_ is not
+// touched here - confirm callers set the dense shape on `dst` themselves.
+template <typename Context>
+void CopySparse(const Context& dev_ctx,
+                const SparseCsrTensor& src,
+                bool blocking,
+                SparseCsrTensor* dst) {
+  DenseTensor* dst_crows = dst->mutable_non_zero_crows();
+  DenseTensor* dst_cols = dst->mutable_non_zero_cols();
+  DenseTensor* dst_values = dst->mutable_non_zero_elements();
+
+  Copy(dev_ctx, src.non_zero_crows(), blocking, dst_crows);
+  Copy(dev_ctx, src.non_zero_cols(), blocking, dst_cols);
+  Copy(dev_ctx, src.non_zero_elements(), blocking, dst_values);
+}
+
 }  // namespace pten
 
 PT_REGISTER_GENERAL_KERNEL(
     copy, GPU, ALL_LAYOUT, pten::Copy<pten::GPUContext>, ALL_DTYPE) {}
+// Registers pten::CopySparse<pten::GPUContext> under the kernel name
+// "copy_sparse" for the GPU backend, with ALL_LAYOUT and ALL_DTYPE.
+PT_REGISTER_GENERAL_KERNEL(copy_sparse,
+                           GPU,
+                           ALL_LAYOUT,
+                           pten::CopySparse<pten::GPUContext>,
+                           ALL_DTYPE) {}
diff --git a/paddle/pten/tests/core/CMakeLists.txt b/paddle/pten/tests/core/CMakeLists.txt
index 27a0173ef6f1fc8654fdbe4ef7b585f3ec3d7651..60a0ca285412fe01e0b740c89fdd69f4e16ad3df 100644
--- a/paddle/pten/tests/core/CMakeLists.txt
+++ b/paddle/pten/tests/core/CMakeLists.txt
@@ -3,6 +3,8 @@ cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc)
 cc_test(test_type_info SRCS test_type_info.cc)
 cc_test(test_convert_utils SRCS test_convert_utils.cc DEPS convert_utils)
 cc_test(test_kernel_factory SRCS test_kernel_factory.cc DEPS kernel_factory scale_kernel)
+cc_test(test_sparse_coo_tensor SRCS test_sparse_coo_tensor.cc DEPS dense_tensor sparse_coo_tensor)
+cc_test(test_sparse_csr_tensor SRCS test_sparse_csr_tensor.cc DEPS dense_tensor sparse_csr_tensor)
 cc_test(test_op_utils SRCS test_op_utils.cc DEPS op_compat_infos)
 cc_test(test_pten_device_context SRCS test_device_context.cc DEPS pten_context cpu_context)
 
diff --git a/paddle/pten/tests/core/test_sparse_coo_tensor.cc b/paddle/pten/tests/core/test_sparse_coo_tensor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fdec4910e8273edfeb0e719dddd11038300b29c4
--- /dev/null
+++ b/paddle/pten/tests/core/test_sparse_coo_tensor.cc
@@ -0,0 +1,93 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/sparse_coo_tensor.h"
+#include "paddle/pten/tests/core/allocator.h"
+
+namespace pten {
+namespace tests {
+
+TEST(sparse_coo_tensor, construct) {
+  // Builds a 3x3 COO tensor from a 2 x nnz index tensor and nnz float values.
+  pten::CPUPlace cpu;
+  auto dense_dims = pten::framework::make_ddim({3, 3});
+  std::vector<float> non_zero_data = {1.0, 2.0, 3.0};
+  std::vector<int64_t> indices_data = {0, 1, 2, 0, 2, 1};
+  auto fancy_allocator = std::unique_ptr<Allocator>(new FancyAllocator);
+  auto* alloc = fancy_allocator.get();
+  auto indices_dims =
+      pten::framework::make_ddim({2, static_cast<int>(non_zero_data.size())});
+  DenseTensorMeta indices_meta(DataType::INT64, indices_dims, DataLayout::NCHW);
+  DenseTensor indices(alloc, indices_meta);
+  memcpy(indices.mutable_data<int64_t>(cpu),
+         indices_data.data(),
+         indices_data.size() * sizeof(int64_t));
+
+  auto elements_dims =
+      pten::framework::make_ddim({static_cast<int>(non_zero_data.size())});
+  DenseTensorMeta elements_meta(
+      DataType::FLOAT32, elements_dims, DataLayout::NCHW);
+  DenseTensor elements(alloc, elements_meta);
+  memcpy(elements.mutable_data<float>(cpu),
+         non_zero_data.data(),
+         non_zero_data.size() * sizeof(float));
+
+  SparseCooTensor sparse(indices, elements, dense_dims);
+  // gtest macros report a failure in this test; glog CHECK would abort.
+  ASSERT_TRUE(sparse.initialized());
+  EXPECT_EQ(sparse.nnz(), static_cast<int64_t>(non_zero_data.size()));
+  EXPECT_EQ(sparse.numel(), 9);
+  EXPECT_TRUE(sparse.dims() == dense_dims);
+  EXPECT_TRUE(sparse.dtype() == DataType::FLOAT32);
+  EXPECT_TRUE(sparse.layout() == DataLayout::SPARSE_COO);
+  EXPECT_TRUE(sparse.place() == paddle::platform::CPUPlace());
+}
+
+TEST(sparse_coo_tensor, other_function) {
+  auto fancy_allocator = std::unique_ptr<Allocator>(new FancyAllocator);
+  auto* alloc = fancy_allocator.get();
+  auto dense_dims = pten::framework::make_ddim({4, 4});
+  const int non_zero_num = 2;
+  auto indices_dims = pten::framework::make_ddim({2, non_zero_num});
+  DenseTensorMeta indices_meta(DataType::INT64, indices_dims, DataLayout::NCHW);
+  DenseTensor indices(alloc, indices_meta);
+  auto elements_dims = pten::framework::make_ddim({non_zero_num});
+  DenseTensorMeta elements_meta(
+      DataType::FLOAT32, elements_dims, DataLayout::NCHW);
+  DenseTensor elements(alloc, elements_meta);
+
+  SparseCooTensor coo(indices, elements, dense_dims);
+  ASSERT_TRUE(coo.initialized());
+  EXPECT_EQ(coo.dims(), dense_dims);
+
+  // Test Resize: afterwards nnz() must equal the last argument passed (3).
+  auto dense_dims_3d = pten::framework::make_ddim({2, 4, 4});
+  coo.Resize(dense_dims_3d, 1, 3);
+  EXPECT_EQ(coo.nnz(), 3);
+
+  // Test shallow_copy: the copy constructor must carry over the dims.
+  SparseCooTensor coo2(coo);
+  EXPECT_TRUE(coo.dims() == coo2.dims());
+
+  // Test shallow_copy_assignment: build a 4x4 tensor first so `=` is operator=.
+  SparseCooTensor coo3(indices, elements, dense_dims);
+  coo3 = coo2;
+  EXPECT_TRUE(coo3.dims() == coo2.dims());
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/paddle/pten/tests/core/test_sparse_csr_tensor.cc b/paddle/pten/tests/core/test_sparse_csr_tensor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d4d498cdf865bbdbf4502412a11ce3194cc80d43
--- /dev/null
+++ b/paddle/pten/tests/core/test_sparse_csr_tensor.cc
@@ -0,0 +1,108 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/sparse_csr_tensor.h"
+#include "paddle/pten/tests/core/allocator.h"
+
+namespace pten {
+namespace tests {
+
+TEST(sparse_csr_tensor, construct) {
+  // Builds a 3x3 CSR tensor from crows/cols/values and checks its metadata.
+  pten::CPUPlace cpu;
+  auto dense_dims = pten::framework::make_ddim({3, 3});
+  std::vector<float> non_zero_data = {1.0, 2.0, 3.0};
+  std::vector<int64_t> crows_data = {0, 1, 1, 3};
+  std::vector<int64_t> cols_data = {1, 0, 2};
+  auto fancy_allocator = std::unique_ptr<Allocator>(new FancyAllocator);
+  auto alloc = fancy_allocator.get();
+  // create non_zero_crows: 1-D INT64 tensor holding crows_data (4 entries)
+  auto crows_dims =
+      pten::framework::make_ddim({static_cast<int>(crows_data.size())});
+  DenseTensorMeta crows_meta(DataType::INT64, crows_dims, DataLayout::NCHW);
+  DenseTensor crows(alloc, crows_meta);
+  memcpy(crows.mutable_data<int64_t>(cpu),
+         crows_data.data(),
+         crows_data.size() * sizeof(int64_t));
+
+  // create non_zero_cols: 1-D INT64 tensor holding cols_data (3 entries)
+  auto cols_dims =
+      pten::framework::make_ddim({static_cast<int>(cols_data.size())});
+  DenseTensorMeta cols_meta(DataType::INT64, cols_dims, DataLayout::NCHW);
+  DenseTensor cols(alloc, cols_meta);
+  memcpy(cols.mutable_data<int64_t>(cpu),
+         cols_data.data(),
+         cols_data.size() * sizeof(int64_t));
+
+  // create non_zero_elements: 1-D FLOAT32 tensor holding non_zero_data
+  auto elements_dims =
+      pten::framework::make_ddim({static_cast<int>(non_zero_data.size())});
+  DenseTensorMeta elements_meta(
+      DataType::FLOAT32, elements_dims, DataLayout::NCHW);
+  DenseTensor elements(alloc, elements_meta);
+  memcpy(elements.mutable_data<float>(cpu),
+         non_zero_data.data(),
+         non_zero_data.size() * sizeof(float));
+
+  SparseCsrTensor sparse(crows, cols, elements, dense_dims);
+
+  EXPECT_EQ(sparse.non_zero_cols().numel(),
+            static_cast<int64_t>(non_zero_data.size()));
+  EXPECT_EQ(sparse.numel(), 9);
+  EXPECT_TRUE(sparse.dims() == dense_dims);
+  EXPECT_TRUE(sparse.dtype() == DataType::FLOAT32);
+  EXPECT_TRUE(sparse.layout() == DataLayout::SPARSE_CSR);
+  EXPECT_TRUE(sparse.place() == paddle::platform::CPUPlace());
+  EXPECT_TRUE(sparse.initialized());
+}
+
+TEST(sparse_csr_tensor, other_function) {
+  auto fancy_allocator = std::unique_ptr<Allocator>(new FancyAllocator);
+  auto alloc = fancy_allocator.get();
+  auto dense_dims = pten::framework::make_ddim({4, 4});
+  auto crows_dims = pten::framework::make_ddim({dense_dims[0] + 1});
+  DenseTensorMeta crows_meta(DataType::INT64, crows_dims, DataLayout::NCHW);
+  DenseTensor crows(alloc, crows_meta);
+  const int64_t non_zero_num = 5;
+  auto cols_dims = pten::framework::make_ddim({non_zero_num});
+  DenseTensorMeta cols_meta(DataType::INT64, cols_dims, DataLayout::NCHW);
+  DenseTensor cols(alloc, cols_meta);
+  DenseTensorMeta values_meta(DataType::FLOAT32, cols_dims, DataLayout::NCHW);
+  DenseTensor values(alloc, values_meta);
+
+  SparseCsrTensor csr(crows, cols, values, dense_dims);
+  ASSERT_TRUE(csr.initialized());
+  EXPECT_EQ(csr.dims(), dense_dims);
+
+  // Test Resize: afterwards the cols tensor must hold the requested 2 entries.
+  auto dense_dims_3d = pten::framework::make_ddim({2, 4, 4});
+  csr.Resize(dense_dims_3d, 2);
+  EXPECT_EQ(csr.non_zero_cols().numel(), 2);
+
+  // Test shallow_copy: the copy constructor must carry over the dims.
+  SparseCsrTensor csr2(csr);
+  EXPECT_TRUE(csr.dims() == csr2.dims());
+
+  // Test shallow_copy_assignment: build a 4x4 tensor first so `=` is operator=.
+  SparseCsrTensor csr3(crows, cols, values, dense_dims);
+  csr3 = csr2;
+  EXPECT_TRUE(csr3.dims() == csr2.dims());
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py
index d38e8d1193beffeecd35c19fafdf47c10aaf8927..ca14c55175430d9133caac6b2c58e421f4c092fd 100644
--- a/python/paddle/incubate/nn/layer/fused_transformer.py
+++ b/python/paddle/incubate/nn/layer/fused_transformer.py
@@ -75,7 +75,7 @@ class FusedMultiHeadAttention(Layer):
                  embed_dim,
                  num_heads,
                  dropout_rate=0.5,
-                 attn_dropout_rate=0.5,
+                 attn_dropout_rate=None,
                  kdim=None,
                  vdim=None,
                  normalize_before=False,