diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc
index d1c5983a3702f39a679983638a32e9588e16ff4a..0ed1a198c916dbce2c3bc48cc77e41a7bfa9d2c3 100644
--- a/paddle/fluid/eager/nan_inf_utils.cc
+++ b/paddle/fluid/eager/nan_inf_utils.cc
@@ -114,6 +114,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name,
                             const TupleOfTensorAndVector& tensors) {
   CheckTensorHasNanOrInf(api_name, std::get<0>(tensors));
   CheckTensorHasNanOrInf(api_name, std::get<1>(tensors));
+  CheckTensorHasNanOrInf(api_name, std::get<2>(tensors));
 }
 
 }  // namespace egr
diff --git a/paddle/fluid/eager/nan_inf_utils.h b/paddle/fluid/eager/nan_inf_utils.h
index a411504fa4900d0a0f047e3d2c13a047fdd03888..815e3bd6cd14f11eac3044e67322ad8975f1bf5d 100644
--- a/paddle/fluid/eager/nan_inf_utils.h
+++ b/paddle/fluid/eager/nan_inf_utils.h
@@ -31,7 +31,8 @@ using TupleOfFourTensors = std::tuple<Tensor, Tensor, Tensor, Tensor>;
 using TupleOfFiveTensors = std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor>;
 using TupleOfSixTensors =
     std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor>;
-using TupleOfTensorAndVector = std::tuple<Tensor, std::vector<Tensor>>;
+using TupleOfTensorAndVector =
+    std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>>;
 
 void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor);
 
diff --git a/paddle/fluid/operators/einsum_op.cc b/paddle/fluid/operators/einsum_op.cc
index 7fc19d6913f83af32bb1fad1fc828043c1ca40e6..95f841f7797b9a47d01c57246277aa718508f3ae 100644
--- a/paddle/fluid/operators/einsum_op.cc
+++ b/paddle/fluid/operators/einsum_op.cc
@@ -41,6 +41,10 @@ class EinsumOpMaker : public framework::OpProtoAndCheckerMaker {
         .AsExtra()
         .AsIntermediate();
 
+    AddOutput("XShape", "(Tensor), The cache of the x_shape of: A and B.")
+        .AsDuplicable()
+        .AsExtra()
+        .AsIntermediate();
     AddAttr<std::string>("equation",
                          "(string) A einsum equation. such as `ij,jk->ik`"
                          "There must have `->` and the number of operands in "
@@ -59,8 +63,8 @@ class EinsumGradOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     auto x_name = "Operands";
     auto x_grad_name = framework::GradVarName(x_name);
-    ctx->SetOutputsDim(x_grad_name, ctx->GetInputsDim(x_name));
-    ctx->ShareAllLoD(x_name, x_grad_name);
+    ctx->SetOutputsDim(x_grad_name, ctx->GetInputsDim("Operands"));
+    ctx->ShareAllLoD("Operands", x_grad_name);
   }
 
  protected:
@@ -79,8 +83,15 @@ class EinsumGradMaker : public framework::SingleGradOpMaker<T> {
 
   void Apply(GradOpPtr<T> retv) const override {
     retv->SetType("einsum_grad");
-    retv->SetInput("Operands", this->Input("Operands"));
-    retv->SetInput("InnerCache", this->Output("InnerCache"));
+    if (this->HasOutput("InnerCache")) {
+      retv->SetInput("InnerCache", this->Output("InnerCache"));
+    }
+    if (this->HasOutput("XShape")) {
+      // Programs saved before XShape existed lack it; keep the fallback.
+      retv->SetInput("Operands", this->Output("XShape"));  // to save memory.
+    } else {
+      retv->SetInput("Operands", this->Input("Operands"));
+    }
     retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
     retv->SetAttrMap(this->Attrs());
     retv->SetOutput(framework::GradVarName("Operands"),
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index bc41a24c44562edde97b5552e127d56ab7462b04..072ab6fd68a1a6692fccf12d80fe3ee3beca8433 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -402,7 +402,8 @@ void EighInferMeta(const MetaTensor& x,
 void EinsumInferMeta(const std::vector<const MetaTensor*>& inputs,
                      const std::string& equation,
                      MetaTensor* out,
-                     std::vector<MetaTensor*> inner_cache) {
+                     std::vector<MetaTensor*> inner_cache,
+                     std::vector<MetaTensor*> xshape) {
   // collect the following informations to prepare einsum.
   LabelMap labelshape(0);
   LabelMap labeltype(LabelType::Reduction);
@@ -439,6 +440,12 @@ void EinsumInferMeta(const std::vector<const MetaTensor*>& inputs,
   VLOG(3) << "Label Shape is : " << label_to_string(all_labels, labelshape);
   out->set_dims(make_ddim(output_dims));
   out->set_dtype(inputs[0]->dtype());
+  for (size_t i = 0; i < xshape.size(); ++i) {
+    if (xshape[i] != nullptr) {
+      xshape[i]->set_dims(inputs[i]->dims());
+      xshape[i]->set_dtype(inputs[i]->dtype());
+    }
+  }
 }
 
 void ExpandInferMeta(const MetaTensor& x,
diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h
index a0cad3e628e3fe34d0167735b9dc90517d99eab7..f64d406e019ce755cb600dfa03a7dacb0bea1942 100644
--- a/paddle/phi/infermeta/unary.h
+++ b/paddle/phi/infermeta/unary.h
@@ -83,7 +83,8 @@ void EighInferMeta(const MetaTensor& x,
 void EinsumInferMeta(const std::vector<const MetaTensor*>& inputs,
                      const std::string& equation,
                      MetaTensor* out,
-                     std::vector<MetaTensor*> inner_cache);
+                     std::vector<MetaTensor*> inner_cache,
+                     std::vector<MetaTensor*> xshape);
 
 void ExpandInferMeta(const MetaTensor& x,
                      const IntArray& shape,
diff --git a/paddle/phi/kernels/einsum_kernel.h b/paddle/phi/kernels/einsum_kernel.h
index 87df2b1c64a4a993b054b42926fa0508ed5e8a96..569cf7a55afd4a62783e6708c6b2d339d9fabd3c 100644
--- a/paddle/phi/kernels/einsum_kernel.h
+++ b/paddle/phi/kernels/einsum_kernel.h
@@ -29,6 +29,7 @@ void EinsumKernelRaw(const Context& dev_ctx,
                      const std::vector<const DenseTensor*>& inputs,
                      const std::string& equation,
                      DenseTensor* out,
-                     std::vector<DenseTensor*> cache);
+                     std::vector<DenseTensor*> inner_cache,
+                     std::vector<DenseTensor*> xshape);
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h
index a72db326807f8eea865b197e6723924413e29a9b..a04185a0c53ed8a730a71dc230b89b2cb4dfc7a8 100644
--- a/paddle/phi/kernels/impl/einsum_grad_impl.h
+++ b/paddle/phi/kernels/impl/einsum_grad_impl.h
@@ -177,7 +177,6 @@ void EinsumGradKernel(const Context& dev_ctx,
       cache[0].ShareBufferWith(*(inner_cache[0]));
       cache[1].ShareBufferWith(*(inner_cache[1]));
     }
-
     EinsumKernelImpl<T, Context>(dev_ctx,
                                  all_labels,
                                  operands_for_A,
diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h
index f3521c81ce46bf0ab1cd1c0cdfdfa8d837d7d7f6..43b2760b404f9898fa304bbd4a7e1c102116daa7 100644
--- a/paddle/phi/kernels/impl/einsum_impl.h
+++ b/paddle/phi/kernels/impl/einsum_impl.h
@@ -459,7 +459,7 @@ DenseTensor PerformContraction(
     }
     // reduction
     DenseTensor trans_t;
-    if (FLAGS_einsum_opt && use_cache && cache[operand_idx] != nullptr &&
+    if (use_cache && cache[operand_idx] != nullptr &&
         cache[operand_idx]->IsInitialized()) {
       trans_t.ShareBufferWith(*(cache[operand_idx]));
       VLOG(5) << "Cache Used!";
@@ -468,7 +468,7 @@ DenseTensor PerformContraction(
           dev_ctx, t, perm, all_labels, ellipsis, label2type);
       trans_t = PerformTranspose<T, Context>(
           dev_ctx, reduct_t, perm, reordered_all_labels, ellipsis, label2type);
-      if (FLAGS_einsum_opt && cache[operand_idx] != nullptr)
+      if (cache[operand_idx] != nullptr)
         cache[operand_idx]->ShareBufferWith(trans_t);
     }
     auto mul_dims = GetShapeByType<int>(all_labels,
@@ -599,6 +599,11 @@ void EinsumKernelImpl(const Context& dev_ctx,
                                   out);
     // Reshape Procedure
   } else if (inputs.size() == 1) {
+    if (cache[0] != nullptr) {  // For compatibility: the cache may be nullptr
+                                // when loading a program from v2.3.0.
+      (*cache[0]) = *(inputs[0]);  // Share the buffer for backward, because
+                                   // backward can only see the cached tensor.
+    }
     auto reduce_A = PerformReduction<T, Context>(dev_ctx,
                                                  *inputs[0],
                                                  label2perms[0],
@@ -627,7 +632,8 @@ void EinsumKernelRaw(const Context& dev_ctx,
                      const std::vector<const DenseTensor*>& inputs,
                      const std::string& equation,
                      DenseTensor* out,
-                     std::vector<DenseTensor*> cache) {
+                     std::vector<DenseTensor*> cache,
+                     std::vector<DenseTensor*> xshape) {
   std::vector<char> tmp;
   // for the sake of compatibility, we may load and run v2.3 EinsumOp. Output
   // may have nullptr and the cache.size() is not equal to inputs.size(). refer
diff --git a/paddle/phi/ops/compat/einsum_sig.cc b/paddle/phi/ops/compat/einsum_sig.cc
index 5e45bcf97ce0e5d79e0fe17be9c8122daa5b60bd..4fd31c1a2d84211ed39ae372a34e54e6971d616a 100644
--- a/paddle/phi/ops/compat/einsum_sig.cc
+++ b/paddle/phi/ops/compat/einsum_sig.cc
@@ -18,7 +18,7 @@ namespace phi {
 
 KernelSignature EinsumOpArgumentMapping(const ArgumentMappingContext& ctx) {
   return KernelSignature(
-      "einsum", {"Operands"}, {"equation"}, {"Out", "InnerCache"});
+      "einsum", {"Operands"}, {"equation"}, {"Out", "InnerCache", "XShape"});
 }
 
 KernelSignature EinsumGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
diff --git a/python/paddle/fluid/tests/unittests/test_einsum_op.py b/python/paddle/fluid/tests/unittests/test_einsum_op.py
index c36950b6922fef232221c10aeeaf0779c4182a99..e34d04be927cc98c79014547d3f0e0cb366c0b4d 100644
--- a/python/paddle/fluid/tests/unittests/test_einsum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_einsum_op.py
@@ -39,7 +39,9 @@ class TestEinsumBinary(OpTest):
             'Out':
             out,
             "InnerCache": [('cache_' + str(i), np.array([1.0]))
-                           for i in range(len(self.operands))]
+                           for i in range(len(self.operands))],
+            "XShape": [('xshape_' + str(i), np.array([1.0]))
+                       for i in range(len(self.operands))],
         }
 
     def init_input(self):
@@ -48,14 +50,13 @@ class TestEinsumBinary(OpTest):
             self.inputs.append(np.random.random(s).astype(t))
 
     def set_mandatory(self):
-        self.disable = False
         self.shapes = [(10, 10, 20), (20, 6)]
         self.types = [np.float64, np.float64]
         self.equation = "mij,jk->ki"
 
     def test_check_output(self):
         if not self.disable:
-            self.check_output(no_check_set=["InnerCache"])
+            self.check_output(no_check_set=["InnerCache", "XShape"])
 
     def test_grad(self):
         if not self.disable:
diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py
index 0cdced2cf9b84924c6ccc19cc191922e8408396b..34a1ead2cb497cf88157c53b7cd3774ab30cd42d 100644
--- a/python/paddle/tensor/einsum.py
+++ b/python/paddle/tensor/einsum.py
@@ -807,9 +807,9 @@ def gen_einsum_op(equation, *operands):
 
     if _in_legacy_dygraph():
         # dygraph
-        return _C_ops.einsum(operands, len(operands), 'equation', equation)[0]
+        return _C_ops.einsum(operands, len(operands), len(operands), 'equation',
+                             equation)[0]
 
-    # static graph
     for inp in operands:
         check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum')
     check_type(equation, 'equation', str, 'einsum')
@@ -821,11 +821,16 @@ def gen_einsum_op(equation, *operands):
         helper.create_variable_for_type_inference(dtype=operands[0].dtype)
         for i in range(len(operands))
     ]
+    xshape = [
+        helper.create_variable_for_type_inference(dtype=operands[0].dtype)
+        for i in range(len(operands))
+    ]
     helper.append_op(type='einsum',
                      inputs={'Operands': operands},
                      outputs={
                          'Out': out,
-                         "InnerCache": caches
+                         "InnerCache": caches,
+                         "XShape": xshape
                      },
                      attrs=attrs)
     return out
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index 2b8cff3543e768ce2ec428736e610dded4bbefb2..58c9ea6e5d2e87acd4665dad1a306656c894ebdb 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -603,7 +603,7 @@
 
 - api : einsum
   args : (Tensor[] x, str equation)
-  output : Tensor, Tensor[]{x.size()}
+  output : Tensor, Tensor[]{x.size()}, Tensor[]{x.size()}
   infer_meta :
     func : EinsumInferMeta
     param : [x, equation]
diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml
index 8e20b05110e71dfc9ffe0addb88452795418c2a8..2cdf22beeed964b43a86c1fd3aa0f69d252ad180 100644
--- a/python/paddle/utils/code_gen/backward.yaml
+++ b/python/paddle/utils/code_gen/backward.yaml
@@ -1,3 +1,14 @@
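+#- backward_api : einsum_grad
+# NOTE: disabled copy of the pre-x_shape einsum_grad entry; the active entry is defined later in this file.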
+  #forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache)
+  #args : (Tensor[] x, Tensor[] inner_cache, Tensor out_grad, str equation)
+  #output : Tensor[](x_grad){x.size()}
+  #infer_meta :
+    #func : UnchangedMultiInferMeta
+    #param : [x]
+  #kernel :
+    #func : einsum_grad
+
 - backward_api : abs_double_grad
   forward : abs_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x)
   args : (Tensor x, Tensor grad_x_grad)
@@ -616,12 +627,12 @@
     skip_transform : out_w, out_w_grad
 
 - backward_api : einsum_grad
-  forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache)
-  args : (Tensor[] x, Tensor[] inner_cache, Tensor out_grad, str equation)
-  output : Tensor[](x_grad){x.size()}
+  forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache), Tensor[](x_shape)
+  args : (Tensor[] x_shape, Tensor[] inner_cache, Tensor out_grad, str equation)
+  output : Tensor[](x_grad){x_shape.size()}
   infer_meta :
     func : UnchangedMultiInferMeta
-    param : [x]
+    param : [x_shape]
   kernel :
     func : einsum_grad