diff --git a/paddle/fluid/ir/dialect/pd_op.yaml b/paddle/fluid/ir/dialect/pd_op.yaml
index 5796cea48b13aff752caedf48468e3208afbba53..0fe59e00610fb3a46ed20e0dacd44b1269f894a3 100644
--- a/paddle/fluid/ir/dialect/pd_op.yaml
+++ b/paddle/fluid/ir/dialect/pd_op.yaml
@@ -139,7 +139,7 @@
   - {typename: 'bool', name: print_tensor_shape, default_value: 'true'}
   - {typename: 'bool', name: print_tensor_layout, default_value: 'true'}
   - {typename: 'bool', name: print_tensor_lod, default_value: 'true'}
-  - {typename: 'str', name: print_phase, default_value: 'BOTH'}
+  - {typename: 'str', name: print_phase, default_value: '"BOTH"'}
   - {typename: 'bool', name: is_forward, default_value: 'true'}
   outputs:
     - typename: Tensor
@@ -147,6 +147,17 @@
       optional: false
       no_need_buffer: false
       data_transform: {}
+  infer_meta:
+    func: UnchangedInferMeta
+    param: [in]
+  kernel:
+    func: [print_kernel]
+    param: [in, first_n, message, summarize, print_tensor_name, print_tensor_type, print_tensor_shape, print_tensor_layout, print_tensor_lod, print_phase, is_forward ]
+    backend: null
+    layout: null
+    data_type: null
+    dispatch: {print: null}
+    force_backend: null
   no_need_buffer: null
   data_transform: null
   inplace: null
diff --git a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc
index f85192bada6d3933bea135fe5463d7fa931cbc95..1d479884d85f15137c2d00118f39e442eafb1ec9 100644
--- a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc
+++ b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc
@@ -270,10 +270,8 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
 
     auto kernel_key =
         GetKernelKey(*it, place, map_value_pair, std::move(op_info_parser));
-    VLOG(6) << "kernel type " << kernel_key;
 
-    // only for single output
-    // need update new kernel key layout and data tyep
+    VLOG(6) << "kernel type " << kernel_key;
 
     std::vector<ir::Type> op_output_types;
     if ((*it)->num_results() > 0) {
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 14f850bcbe3d84814c06b70a0897bd834945031a..60efffc107dccdeb5a447c7ae68330320c3520cb 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -162,7 +162,6 @@ if(WITH_XPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xpulib)
 endif()
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} tensor_formatter)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} op_version_registry)
 
 
@@ -176,7 +175,6 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} op_version_registry)
 set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS})
 set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies")
 
-cc_library(tensor_formatter SRCS tensor_formatter.cc DEPS ${OP_HEADER_DEPS})
 if (WITH_PYTHON)
   cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind)
 endif()
diff --git a/paddle/fluid/operators/assert_op.cc b/paddle/fluid/operators/assert_op.cc
index acc6853af55b3175e6f8a12a54c04c7119589f2f..2795eb15b55e42f9846f1f974583f6a986a6a140 100644
--- a/paddle/fluid/operators/assert_op.cc
+++ b/paddle/fluid/operators/assert_op.cc
@@ -14,7 +14,7 @@
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
-#include "paddle/fluid/operators/tensor_formatter.h"
+#include "paddle/phi/kernels/funcs/tensor_formatter.h"
 
 namespace phi {
 class DenseTensor;
@@ -70,7 +70,7 @@ class AssertOp : public framework::OperatorBase {
       return;
     }
 
-    TensorFormatter formatter;
+    funcs::TensorFormatter formatter;
     formatter.SetSummarize(Attr<int64_t>(kSummarize));
 
     const std::vector<std::string> &x_names = Inputs(kData);
diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
index 15a803d83282c17e8ad0e7e9ad573692864810b3..3d014d417bc63aaf166c17eba75fab48d5c89d52 100644
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
@@ -14,7 +14,7 @@
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/fluid/operators/tensor_formatter.h"
+#include "paddle/phi/kernels/funcs/tensor_formatter.h"
 
 namespace phi {
 class DenseTensor;
@@ -87,7 +87,7 @@ class PrintOp : public framework::OperatorBase {
     int first_n = Attr<int>("first_n");
     if (first_n > 0 && ++times_ > first_n) return;
 
-    TensorFormatter formatter;
+    funcs::TensorFormatter formatter;
     const std::string &name =
         Attr<bool>("print_tensor_name") ? printed_var_name : "";
     formatter.SetPrintTensorType(Attr<bool>("print_tensor_type"));
diff --git a/paddle/phi/kernels/cpu/feed_with_place_kernel.cc b/paddle/phi/kernels/cpu/feed_with_place_kernel.cc
index aaafeb00d0ff5516f26ae096e0a3df99d17d92c7..8ae3248dd92e497364e34833a6121089b042ae26 100644
--- a/paddle/phi/kernels/cpu/feed_with_place_kernel.cc
+++ b/paddle/phi/kernels/cpu/feed_with_place_kernel.cc
@@ -18,6 +18,8 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/feed_with_place_impl.h"
 
+#include "paddle/phi/kernels/funcs/tensor_formatter.h"
+
 namespace phi {
 
 template <typename T, typename Context>
@@ -50,5 +52,19 @@ PD_REGISTER_KERNEL(shadow_feed,
                    phi::complex64,
                    phi::complex128) {}
 
+PD_REGISTER_KERNEL(print_kernel,  // name referenced by pd_op.yaml kernel.func
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::PrintKernel,
+                   bool,
+                   float,
+                   int32_t,
+                   int64_t,
+                   double,
+                   phi::float16,    // NOTE(review): TensorFormatter::Format has
+                   phi::bfloat16,   // no data branch for the four dtypes from
+                   phi::complex64,  // float16 down; it logs "unprintable type"
+                   phi::complex128) {}
+
 PD_REGISTER_KERNEL(
     shadow_output, CPU, ALL_LAYOUT, phi::ShadowOutputKernel, float) {}
diff --git a/paddle/phi/kernels/feed_with_place_kernel.h b/paddle/phi/kernels/feed_with_place_kernel.h
index 1d173797fd864cc3c9ee732a5f044a80ba00107c..bc7440f22f412c093216b5b3bbcc56ea17f466f8 100644
--- a/paddle/phi/kernels/feed_with_place_kernel.h
+++ b/paddle/phi/kernels/feed_with_place_kernel.h
@@ -35,4 +35,19 @@ void ShadowFeedKernel(const Context& ctx,
                       const DenseTensor& x,
                       DenseTensor* out);
 
+template <typename T, typename Context>
+void PrintKernel(const Context& ctx,
+                 const DenseTensor& x,
+                 int first_n,  // accepted but not yet honored by the impl
+                 const std::string& message,
+                 int summarize,
+                 bool print_tensor_name,
+                 bool print_tensor_type,
+                 bool print_tensor_shape,
+                 bool print_tensor_layout,
+                 bool print_tensor_lod,
+                 const std::string& print_phase,
+                 bool is_forward,
+                 DenseTensor* out);  // defined in impl/feed_with_place_impl.h
+
 }  // namespace phi
diff --git a/paddle/fluid/operators/tensor_formatter.cc b/paddle/phi/kernels/funcs/tensor_formatter.cc
similarity index 80%
rename from paddle/fluid/operators/tensor_formatter.cc
rename to paddle/phi/kernels/funcs/tensor_formatter.cc
index c04f544c6c348371ef4efc7ae58b01223ecc479f..0b9d4f31d553e3871de9b8ab64e1f8be6c2e1796 100644
--- a/paddle/fluid/operators/tensor_formatter.cc
+++ b/paddle/phi/kernels/funcs/tensor_formatter.cc
@@ -12,14 +12,16 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/fluid/operators/tensor_formatter.h"
+#include "paddle/phi/kernels/funcs/tensor_formatter.h"
 
 #include <string>
 
-#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/phi/backends/context_pool.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/tensor_utils.h"
 
 namespace paddle {
-namespace operators {
+namespace funcs {
 
 void TensorFormatter::SetPrintTensorType(bool print_tensor_type) {
   print_tensor_type_ = print_tensor_type;
@@ -63,7 +65,7 @@ std::string TensorFormatter::Format(const phi::DenseTensor& print_tensor,
 
   if (print_tensor_lod_) {
     log_stream << "  - lod: {";
-    const framework::LoD& lod = print_tensor.lod();
+    const phi::LoD& lod = print_tensor.lod();
     for (auto level : lod) {
       log_stream << "{";
       bool is_first = true;
@@ -87,29 +89,26 @@ std::string TensorFormatter::Format(const phi::DenseTensor& print_tensor,
   }
 
   if (print_tensor_layout_) {
-    log_stream << "  - layout: "
-               << phi::DataLayoutToString(print_tensor.layout()) << std::endl;
+    log_stream << "  - layout: " << print_tensor.layout() << std::endl;
   }
 
-  std::type_index dtype = framework::ToTypeIndex(
-      framework::TransToProtoVarType(print_tensor.dtype()));
+  auto dtype = print_tensor.dtype();
   if (print_tensor_type_) {
-    log_stream << "  - dtype: " << platform::demangle(dtype.name())
-               << std::endl;
+    log_stream << "  - dtype: " << dtype << std::endl;
   }
 
-  if (framework::IsType<const float>(dtype)) {
+  if (dtype == phi::DataType::FLOAT32) {
     FormatData<float>(print_tensor, log_stream);
-  } else if (framework::IsType<const double>(dtype)) {
+  } else if (dtype == phi::DataType::FLOAT64) {
     FormatData<double>(print_tensor, log_stream);
-  } else if (framework::IsType<const int>(dtype)) {
+  } else if (dtype == phi::DataType::INT32) {
     FormatData<int>(print_tensor, log_stream);
-  } else if (framework::IsType<const int64_t>(dtype)) {
+  } else if (dtype == phi::DataType::INT64) {
     FormatData<int64_t>(print_tensor, log_stream);
-  } else if (framework::IsType<const bool>(dtype)) {
+  } else if (dtype == phi::DataType::BOOL) {
     FormatData<bool>(print_tensor, log_stream);
   } else {
-    log_stream << "  - data: unprintable type: " << dtype.name() << std::endl;
+    log_stream << "  - data: unprintable type: " << dtype << std::endl;
   }
   return log_stream.str();
 }
@@ -122,11 +121,15 @@ void TensorFormatter::FormatData(const phi::DenseTensor& print_tensor,
                            : std::min(summarize_, print_tensor.numel());
   const T* data = nullptr;
   phi::DenseTensor cpu_tensor;
-  if (paddle::platform::is_cpu_place(print_tensor.place())) {
+  if (print_tensor.place().GetType() == phi::AllocationType::CPU) {
     data = print_tensor.data<T>();
   } else {
-    platform::CPUPlace cpu_place;
-    paddle::framework::TensorCopy(print_tensor, cpu_place, &cpu_tensor);
+    phi::CPUPlace cpu_place;
+
+    phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+    auto dev_ctx = pool.Get(print_tensor.place());
+
+    phi::Copy(*dev_ctx, print_tensor, cpu_place, true, &cpu_tensor);
     data = cpu_tensor.data<T>();
   }
 
@@ -151,5 +154,5 @@ template void TensorFormatter::FormatData<int>(
 template void TensorFormatter::FormatData<int64_t>(
     const phi::DenseTensor& print_tensor, std::stringstream& log_stream);
 
-}  // namespace operators
+}  // namespace funcs
 }  // namespace paddle
diff --git a/paddle/fluid/operators/tensor_formatter.h b/paddle/phi/kernels/funcs/tensor_formatter.h
similarity index 92%
rename from paddle/fluid/operators/tensor_formatter.h
rename to paddle/phi/kernels/funcs/tensor_formatter.h
index 87ee84c15588eda03b349f6aff18b45705fd06e7..0914ade6016ff0b5572801bb983cab10be99e4d6 100644
--- a/paddle/fluid/operators/tensor_formatter.h
+++ b/paddle/phi/kernels/funcs/tensor_formatter.h
@@ -15,15 +15,14 @@
 #pragma once
 #include <string>
 
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/framework/var_type.h"
+#include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
 class DenseTensor;
 }  // namespace phi
 
 namespace paddle {
-namespace operators {
+namespace funcs {
 
 class TensorFormatter {
  public:
@@ -55,5 +54,5 @@ class TensorFormatter {
   bool print_tensor_layout_ = true;
 };
 
-}  // namespace operators
+}  // namespace funcs
 }  // namespace paddle
diff --git a/paddle/phi/kernels/gpu/feed_with_place_kernel.cu b/paddle/phi/kernels/gpu/feed_with_place_kernel.cu
index f848ff0c2b174256515e753e7334cb07032798c5..e22eb9cd7f02441c8604cb51aabae4bec3d805a5 100644
--- a/paddle/phi/kernels/gpu/feed_with_place_kernel.cu
+++ b/paddle/phi/kernels/gpu/feed_with_place_kernel.cu
@@ -31,3 +31,17 @@ PD_REGISTER_KERNEL(shadow_feed,
                    phi::bfloat16,
                    phi::complex64,
                    phi::complex128) {}
+
+PD_REGISTER_KERNEL(print_kernel,  // name referenced by pd_op.yaml kernel.func
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::PrintKernel,
+                   bool,
+                   float,
+                   int32_t,
+                   int64_t,
+                   double,
+                   phi::float16,    // NOTE(review): TensorFormatter::Format has
+                   phi::bfloat16,   // no data branch for the four dtypes from
+                   phi::complex64,  // float16 down; it logs "unprintable type"
+                   phi::complex128) {}
diff --git a/paddle/phi/kernels/impl/feed_with_place_impl.h b/paddle/phi/kernels/impl/feed_with_place_impl.h
index 269c4c886dfe89ae63ef6128d38c1a828de3dfc1..29611a8cfe887c73dc0f3920e2a0680bd2f80f2b 100644
--- a/paddle/phi/kernels/impl/feed_with_place_impl.h
+++ b/paddle/phi/kernels/impl/feed_with_place_impl.h
@@ -16,9 +16,13 @@
 
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/tensor_utils.h"
+#include "paddle/phi/kernels/funcs/tensor_formatter.h"
 
 namespace phi {
 
+const char kForward[] = "FORWARD";    // print_phase value: forward pass only
+const char kBackward[] = "BACKWARD";  // print_phase value: backward pass only
+
 template <typename T, typename Context>
 void ShadowFeedKernel(const Context& ctx,
                       const DenseTensor& x,
@@ -32,4 +36,40 @@ void ShadowFeedKernel(const Context& ctx,
   }
 }
 
+template <typename T, typename Context>
+void PrintKernel(const Context& ctx,
+                 const DenseTensor& x,
+                 int first_n,  // accepted but ignored for now (TODO below)
+                 const std::string& message,
+                 int summarize,
+                 bool print_tensor_name,
+                 bool print_tensor_type,
+                 bool print_tensor_shape,
+                 bool print_tensor_layout,
+                 bool print_tensor_lod,
+                 const std::string& print_phase,
+                 bool is_forward,
+                 DenseTensor* out) {
+  phi::Copy<Context>(ctx, x, ctx.GetPlace(), true, out);
+  out->set_lod(x.lod());
+  // out now mirrors x (data and LoD); skip printing if the phase is excluded.
+  if ((is_forward && print_phase == kBackward) ||
+      (!is_forward && print_phase == kForward)) {
+    return;
+  }
+
+  // TODO(phlrain): support first_n using an input tensor. The legacy check,
+  // if (first_n > 0 && ++times_ > first_n) return;
+  // needs a per-op call counter (times_) that this kernel does not have.
+  // TODO(phlrain): support printed_var_name; "var" is a fixed placeholder.
+  paddle::funcs::TensorFormatter formatter;
+  const std::string& name = print_tensor_name ? "var" : "";
+  formatter.SetPrintTensorType(print_tensor_type);
+  formatter.SetPrintTensorShape(print_tensor_shape);
+  formatter.SetPrintTensorLod(print_tensor_lod);
+  formatter.SetPrintTensorLayout(print_tensor_layout);
+  formatter.SetSummarize(summarize);
+  formatter.Print(x, name, message);
+}
+
 }  // namespace phi
diff --git a/test/ir/new_ir/test_standalone_new_ir.py b/test/ir/new_ir/test_standalone_new_ir.py
index d804f8a67dbaf21edae1b8a679475a64fa009913..e954523f6b9aa8e301b7eb848a2efda59f43001e 100644
--- a/test/ir/new_ir/test_standalone_new_ir.py
+++ b/test/ir/new_ir/test_standalone_new_ir.py
@@ -243,6 +243,33 @@ class TestSplitOp(unittest.TestCase):
             np.testing.assert_array_equal(out[0], np_a[0:2])
 
 
+class TestNewIrPrint(unittest.TestCase):
+    def test_with_new_ir(self):  # Print must pass x + y through unchanged
+        paddle.enable_static()
+        place = (
+            paddle.CUDAPlace(0)
+            if paddle.is_compiled_with_cuda()
+            else paddle.CPUPlace()
+        )
+        exe = paddle.static.Executor(place)
+
+        main_program = paddle.static.Program()
+        new_scope = paddle.static.Scope()
+        with paddle.static.scope_guard(new_scope):
+            with paddle.static.program_guard(main_program):
+                x = paddle.ones([2, 2], dtype="float32")
+                y = paddle.ones([2, 2], dtype="float32")
+
+                z = x + y
+                z = paddle.static.Print(z)  # fetch target is Print's output
+
+            out = exe.run(main_program, {}, fetch_list=[z.name])
+
+        gold_res = np.ones([2, 2], dtype="float32") * 2  # ones + ones
+
+        np.testing.assert_array_equal(out[0], gold_res)
+
+
 class TestJitSaveOp(unittest.TestCase):
     def test_with_new_ir(self):
         paddle.disable_static()