diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc
index d1c5983a3702f39a679983638a32e9588e16ff4a..0ed1a198c916dbce2c3bc48cc77e41a7bfa9d2c3 100644
--- a/paddle/fluid/eager/nan_inf_utils.cc
+++ b/paddle/fluid/eager/nan_inf_utils.cc
@@ -114,6 +114,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name,
                             const TupleOfTensorAndVector& tensors) {
   CheckTensorHasNanOrInf(api_name, std::get<0>(tensors));
   CheckTensorHasNanOrInf(api_name, std::get<1>(tensors));
+  CheckTensorHasNanOrInf(api_name, std::get<2>(tensors));
 }
 
 }  // namespace egr
diff --git a/paddle/fluid/eager/nan_inf_utils.h b/paddle/fluid/eager/nan_inf_utils.h
index a411504fa4900d0a0f047e3d2c13a047fdd03888..815e3bd6cd14f11eac3044e67322ad8975f1bf5d 100644
--- a/paddle/fluid/eager/nan_inf_utils.h
+++ b/paddle/fluid/eager/nan_inf_utils.h
@@ -31,7 +31,8 @@ using TupleOfFourTensors = std::tuple<Tensor, Tensor, Tensor, Tensor>;
 using TupleOfFiveTensors = std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor>;
 using TupleOfSixTensors =
     std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor>;
-using TupleOfTensorAndVector = std::tuple<Tensor, std::vector<Tensor>>;
+using TupleOfTensorAndVector =
+    std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>>;
 
 void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor);
 
diff --git a/paddle/fluid/operators/einsum_op.cc b/paddle/fluid/operators/einsum_op.cc
index 6da0045443cccdbad2965e9c9b320a41c2015d4d..c0566aaeb2f4644fc09ff78af8c290f31172c2d7 100644
--- a/paddle/fluid/operators/einsum_op.cc
+++ b/paddle/fluid/operators/einsum_op.cc
@@ -40,6 +40,10 @@ class EinsumOpMaker : public framework::OpProtoAndCheckerMaker {
         .AsExtra()
         .AsIntermediate();
 
+    AddOutput("XShape", "(Tensor), The cache of the x_shape of: A and B.")
+        .AsDuplicable()
+        .AsExtra()
+        .AsIntermediate();
     AddAttr<std::string>("equation",
                          "(string) A einsum equation. such as `ij,jk->ik`"
                          "There must have `->` and the number of operands in "
@@ -58,8 +62,8 @@ class EinsumGradOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     auto x_name = "Operands";
     auto x_grad_name = framework::GradVarName(x_name);
-    ctx->SetOutputsDim(x_grad_name, ctx->GetInputsDim(x_name));
-    ctx->ShareAllLoD(x_name, x_grad_name);
+    ctx->SetOutputsDim(x_grad_name, ctx->GetInputsDim("Operands"));
+    ctx->ShareAllLoD("Operands", x_grad_name);
   }
 
  protected:
@@ -78,8 +82,15 @@ class EinsumGradMaker : public framework::SingleGradOpMaker<T> {
 
   void Apply(GradOpPtr<T> retv) const override {
     retv->SetType("einsum_grad");
-    retv->SetInput("Operands", this->Input("Operands"));
-    retv->SetInput("InnerCache", this->Output("InnerCache"));
+    if (this->HasOutput("InnerCache")) {
+      retv->SetInput("InnerCache", this->Output("InnerCache"));
+    }
+    if (this->HasOutput("XShape")) {
+      // add if for compatibility.
+      retv->SetInput("Operands", this->Output("XShape"));  // for memory save.
+    } else {
+      retv->SetInput("Operands", this->Input("Operands"));
+    }
     retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
     retv->SetAttrMap(this->Attrs());
     retv->SetOutput(framework::GradVarName("Operands"),
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index 980b4219c51dd01dd3af3a48287d65b90083c728..43265cf5e6dcc4b3527e4d30fff2eae5cd7d6cc9 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -402,7 +402,8 @@ void EighInferMeta(const MetaTensor& x,
 void EinsumInferMeta(const std::vector<const MetaTensor*>& inputs,
                      const std::string& equation,
                      MetaTensor* out,
-                     std::vector<MetaTensor*> inner_cache) {
+                     std::vector<MetaTensor*> inner_cache,
+                     std::vector<MetaTensor*> xshape) {
   // collect the following informations to prepare einsum.
   LabelMap labelshape(0);
   LabelMap labeltype(LabelType::Reduction);
@@ -439,6 +440,12 @@ void EinsumInferMeta(const std::vector<const MetaTensor*>& inputs,
   VLOG(3) << "Label Shape is : " << label_to_string(all_labels, labelshape);
   out->set_dims(make_ddim(output_dims));
   out->set_dtype(inputs[0]->dtype());
+  for (size_t i = 0; i < xshape.size(); ++i) {
+    if (xshape[i] != nullptr) {
+      xshape[i]->set_dims(inputs[i]->dims());
+      xshape[i]->set_dtype(inputs[i]->dtype());
+    }
+  }
 }
 
 void ExpandInferMeta(const MetaTensor& x,
diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h
index e141acb2ea293c9ad11826ca9865f5af89c1c614..2c21930e03ab2f7a50f6907b352f469b8d21118d 100644
--- a/paddle/phi/infermeta/unary.h
+++ b/paddle/phi/infermeta/unary.h
@@ -83,7 +83,8 @@ void EighInferMeta(const MetaTensor& x,
 void EinsumInferMeta(const std::vector<const MetaTensor*>& inputs,
                      const std::string& equation,
                      MetaTensor* out,
-                     std::vector<MetaTensor*> inner_cache);
+                     std::vector<MetaTensor*> inner_cache,
+                     std::vector<MetaTensor*> xshape);
 
 void ExpandInferMeta(const MetaTensor& x,
                      const IntArray& shape,
diff --git a/paddle/phi/kernels/einsum_kernel.h b/paddle/phi/kernels/einsum_kernel.h
index 87df2b1c64a4a993b054b42926fa0508ed5e8a96..569cf7a55afd4a62783e6708c6b2d339d9fabd3c 100644
--- a/paddle/phi/kernels/einsum_kernel.h
+++ b/paddle/phi/kernels/einsum_kernel.h
@@ -29,6 +29,7 @@ void EinsumKernelRaw(const Context& dev_ctx,
                      const std::vector<const DenseTensor*>& inputs,
                      const std::string& equation,
                      DenseTensor* out,
-                     std::vector<DenseTensor*> cache);
+                     std::vector<DenseTensor*> inner_cache,
+                     std::vector<DenseTensor*> xshape);
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h
index a72db326807f8eea865b197e6723924413e29a9b..a04185a0c53ed8a730a71dc230b89b2cb4dfc7a8 100644
--- a/paddle/phi/kernels/impl/einsum_grad_impl.h
+++ b/paddle/phi/kernels/impl/einsum_grad_impl.h
@@ -177,7 +177,6 @@ void EinsumGradKernel(const Context& dev_ctx,
       cache[0].ShareBufferWith(*(inner_cache[0]));
       cache[1].ShareBufferWith(*(inner_cache[1]));
     }
-
     EinsumKernelImpl<T, Context>(dev_ctx,
                                  all_labels,
                                  operands_for_A,
diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h
index bfbd6e0c51cfc7b6592b78335810c75e449fb042..fb0ea2132dbb76bab7cc41e2e13df794d67209df 100644
--- a/paddle/phi/kernels/impl/einsum_impl.h
+++ b/paddle/phi/kernels/impl/einsum_impl.h
@@ -458,7 +458,7 @@ DenseTensor PerformContraction(
     }
     // reduction
     DenseTensor trans_t;
-    if (FLAGS_einsum_opt && use_cache && cache[operand_idx] != nullptr &&
+    if (use_cache && cache[operand_idx] != nullptr &&
         cache[operand_idx]->IsInitialized()) {
       trans_t.ShareBufferWith(*(cache[operand_idx]));
       VLOG(5) << "Cache Used!";
@@ -467,7 +467,7 @@ DenseTensor PerformContraction(
           dev_ctx, t, perm, all_labels, ellipsis, label2type);
       trans_t = PerformTranspose<T, Context>(
           dev_ctx, reduct_t, perm, reordered_all_labels, ellipsis, label2type);
-      if (FLAGS_einsum_opt && cache[operand_idx] != nullptr)
+      if (cache[operand_idx] != nullptr)
         cache[operand_idx]->ShareBufferWith(trans_t);
     }
     auto mul_dims = GetShapeByType<int>(all_labels,
@@ -598,6 +598,11 @@ void EinsumKernelImpl(const Context& dev_ctx,
                                   out);
     // Reshape Procedure
   } else if (inputs.size() == 1) {
+    if (cache[0] != nullptr) {  // For compatibility, may be cache is nullptr if
+                                // loading the program from v2.3.0
+      (*cache[0]) = *(inputs[0]);  // ShareBuffer for backward, because backward
+                                   // we can only see cached tensor.
+    }
     auto reduce_A = PerformReduction<T, Context>(dev_ctx,
                                                  *inputs[0],
                                                  label2perms[0],
@@ -626,7 +631,8 @@ void EinsumKernelRaw(const Context& dev_ctx,
                      const std::vector<const DenseTensor*>& inputs,
                      const std::string& equation,
                      DenseTensor* out,
-                     std::vector<DenseTensor*> cache) {
+                     std::vector<DenseTensor*> cache,
+                     std::vector<DenseTensor*> xshape) {
   std::vector<char> tmp;
   // for the sake of compatibility, we may load and run v2.3 EinsumOp. Output
   // may have nullptr and the cache.size() is not equal to inputs.size(). refer
diff --git a/paddle/phi/ops/compat/einsum_sig.cc b/paddle/phi/ops/compat/einsum_sig.cc
index 5e45bcf97ce0e5d79e0fe17be9c8122daa5b60bd..4fd31c1a2d84211ed39ae372a34e54e6971d616a 100644
--- a/paddle/phi/ops/compat/einsum_sig.cc
+++ b/paddle/phi/ops/compat/einsum_sig.cc
@@ -18,7 +18,7 @@ namespace phi {
 
 KernelSignature EinsumOpArgumentMapping(const ArgumentMappingContext& ctx) {
   return KernelSignature(
-      "einsum", {"Operands"}, {"equation"}, {"Out", "InnerCache"});
+      "einsum", {"Operands"}, {"equation"}, {"Out", "InnerCache", "XShape"});
 }
 
 KernelSignature EinsumGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
diff --git a/python/paddle/fluid/tests/unittests/test_einsum_op.py b/python/paddle/fluid/tests/unittests/test_einsum_op.py
index 1a4ae54afefe242c28c4f8198605ed097aeab7e4..4cec7d6373821a032e3ac411bfea30764133d2e1 100644
--- a/python/paddle/fluid/tests/unittests/test_einsum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_einsum_op.py
@@ -37,7 +37,9 @@ class TestEinsumBinary(OpTest):
         self.outputs = {
             'Out': out,
             "InnerCache": [('cache_' + str(i), np.array([1.0]))
-                           for i in range(len(self.operands))]
+                           for i in range(len(self.operands))],
+            "XShape": [('xshape_' + str(i), np.array([1.0]))
+                       for i in range(len(self.operands))],
         }
 
     def init_input(self):
@@ -46,14 +48,13 @@ class TestEinsumBinary(OpTest):
             self.inputs.append(np.random.random(s).astype(t))
 
     def set_mandatory(self):
-        self.disable = False
         self.shapes = [(10, 10, 20), (20, 6)]
         self.types = [np.float64, np.float64]
         self.equation = "mij,jk->ki"
 
     def test_check_output(self):
         if not self.disable:
-            self.check_output(no_check_set=["InnerCache"])
+            self.check_output(no_check_set=["InnerCache", "XShape"])
 
     def test_grad(self):
         if not self.disable:
diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py
index 49cc426a00fd998c2ed24f94fb0002e4466065af..a5ad3c37a4f5df3d76ca9ff259b19e426a26e085 100644
--- a/python/paddle/tensor/einsum.py
+++ b/python/paddle/tensor/einsum.py
@@ -802,9 +802,10 @@ def gen_einsum_op(equation, *operands):
 
     if _in_legacy_dygraph():
         # dygraph
-        return _C_ops.einsum(operands, len(operands), 'equation', equation)[0]
+        return _C_ops.einsum(operands,
+                             len(operands), len(operands), 'equation',
+                             equation)[0]
 
-    # static graph 
     for inp in operands:
         check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum')
     check_type(equation, 'equation', str, 'einsum')
@@ -816,11 +817,16 @@ def gen_einsum_op(equation, *operands):
         helper.create_variable_for_type_inference(dtype=operands[0].dtype)
         for i in range(len(operands))
     ]
+    xshape = [
+        helper.create_variable_for_type_inference(dtype=operands[0].dtype)
+        for i in range(len(operands))
+    ]
     helper.append_op(
         type='einsum',
         inputs={'Operands': operands},
         outputs={'Out': out,
-                 "InnerCache": caches},
+                 "InnerCache": caches,
+                 "XShape": xshape},
         attrs=attrs)
     return out
 
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index 845f6b6ba2fe0fec13a0b94faf53fead2a40a270..6f3351bb6c00c63b03944af120ecd17e0f86e53f 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -547,7 +547,7 @@
 
 - api : einsum
   args : (Tensor[] x, str equation)
-  output : Tensor, Tensor[]{x.size()}
+  output : Tensor, Tensor[]{x.size()}, Tensor[]{x.size()}
   infer_meta :
     func : EinsumInferMeta
     param : [x, equation]
diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml
index e16c57d4d898cec376c4bf274f6d3c80271a006a..b8cb1340d26e6b90e1956fd42b3d9b3814ec5cef 100644
--- a/python/paddle/utils/code_gen/backward.yaml
+++ b/python/paddle/utils/code_gen/backward.yaml
@@ -1,3 +1,15 @@
+- backward_api : abs_double_grad
+  forward : abs_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x)
+  args : (Tensor x, Tensor grad_x_grad)
+  output : Tensor(grad_out_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : abs_double_grad
+  data_transform:
+    skip_transform : grad_x_grad
+
 - backward_api : abs_grad
   forward : abs (Tensor x) -> Tensor(out)
   args : (Tensor x, Tensor out_grad)
@@ -447,12 +459,12 @@
     skip_transform : out_w, out_w_grad
 
 - backward_api : einsum_grad
-  forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache)
-  args : (Tensor[] x, Tensor[] inner_cache, Tensor out_grad, str equation)
-  output : Tensor[](x_grad){x.size()}
+  forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache), Tensor[](x_shape)
+  args : (Tensor[] x_shape, Tensor[] inner_cache, Tensor out_grad, str equation)
+  output : Tensor[](x_grad){x_shape.size()}
   infer_meta :
     func : UnchangedMultiInferMeta
-    param : [x]
+    param : [x_shape]
   kernel :
     func : einsum_grad