未验证 提交 0695e1ac 编写于 作者: J Jack Zhou 提交者: GitHub

Add StringTensor (#39830)

* add string tensor and case convert kernels

* Add strings empty kernel; Reorganize the structure of case convert kernel

* Add string infermeta

* Update mutable_data of string tensor

* rename kernel name

* add string copy tmp

* Fix strings copy device bug

* add utf8 gpu converter

* add string tensor c++ api

* Remove mutable_data of string tensor

* update string tensor interface

* remove charcases_flag.h

* remove some fluid headers

* Add make_ddim

* __HIPCC__ -> PADDLE_WITH_HIP

* remove fluid headers

* fix cpu compile

* remove std::hash

* Fix cudaMalloc

* Remove strings/impl directory

* Fix infrt/get_phi_kernel_info.py;Add custom_kernels deps

* Add empty kernel test

* Remove some comments

* Modify lower/upper api encoding type: string->bool

* STRING->PSTRING; Add CreateInferLikeMeta

* Add code gen for C++ String API

* remove strings_api_utils.h

* Add ignore file (strings_api.h, strings_api.cc)

* update strings gen script

* change args order of case convert kernels

* Add comments for pstring, StringTensor

* cpstring_internal.h -> cpstring_impl.h

* Update according to comments:

1. Remove fluid headers
2. paddle::platform::errors -> phi::errors
3. Use 'place.GetType() == phi::AllocationType::GPU' instead of 'paddle::platform::is_cpu_space()'
4. Use camel code style

* Remove all singletons in strings kernels

* fix rocm compile

* Fix py3 compile

* Fix c++ coverage

* 1. Add pstring proto type
2. Add StringTensor debug info
3. Rename case_convert_kernel to strings_lower_upper
4. Remove serialize/deserialize strings kernel

* DataLayout::PSTRING -> DataLayout::PSTRING_UNION

* Register pstring data type

* Fix strings api gen

* Fix dense tensor register pstring dtype

* Fix error messages

* remove line

* add pstring unittest

* remove test string api unitest

* remove empty line

* Remove some headers to decrease the size of executable file
上级 3b895425
......@@ -9,10 +9,12 @@ paddle/phi/api/backward/backward_api.h
paddle/phi/api/backward/sparse_bw_api.h
paddle/phi/api/include/api.h
paddle/phi/api/include/sparse_api.h
paddle/phi/api/include/strings_api.h
paddle/phi/api/lib/api.cc
paddle/phi/api/lib/dygraph_api.*
paddle/phi/api/lib/backward_api.cc
paddle/phi/api/lib/sparse_api.cc
paddle/phi/api/lib/strings_api.cc
paddle/phi/api/lib/sparse_bw_api.cc
paddle/phi/extension.h
paddle/phi/include/*
......
......@@ -47,6 +47,8 @@ paddle::experimental::DataType TransToPhiDataType(
return DataType::BFLOAT16;
case paddle::framework::proto::VarType::BOOL:
return DataType::BOOL;
case paddle::framework::proto::VarType::PSTRING:
return DataType::PSTRING;
default:
return DataType::UNDEFINED;
}
......@@ -81,6 +83,8 @@ paddle::framework::proto::VarType::Type TransToProtoVarType(
return paddle::framework::proto::VarType::BF16;
case DataType::BOOL:
return paddle::framework::proto::VarType::BOOL;
case DataType::PSTRING:
return paddle::framework::proto::VarType::PSTRING;
default:
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Unsupported data type `%s` when casting it into "
......@@ -117,6 +121,8 @@ size_t DataTypeSize(DataType dtype) {
return sizeof(paddle::platform::complex<float>);
case DataType::COMPLEX128:
return sizeof(paddle::platform::complex<double>);
case DataType::PSTRING:
return sizeof(paddle::platform::pstring);
default:
return 0;
}
......@@ -145,6 +151,8 @@ DataType String2DataType(const std::string& str) {
return DataType::COMPLEX64;
} else if (str == "complex128") {
return DataType::COMPLEX128;
} else if (str == "pstring") {
return DataType::PSTRING;
} else if (str == "bfloat16") {
return DataType::BFLOAT16;
} else {
......@@ -176,6 +184,8 @@ std::string DataType2String(DataType dtype) {
return "complex64";
case DataType::COMPLEX128:
return "complex128";
case DataType::PSTRING:
return "pstring";
case DataType::BFLOAT16:
return "bfloat16";
default:
......
......@@ -18,9 +18,11 @@
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/common/pstring.h"
using float16 = paddle::platform::float16;
using bfloat16 = paddle::platform::bfloat16;
using pstring = phi::dtype::pstring;
namespace paddle {
namespace framework {
......@@ -58,7 +60,8 @@ static DataTypeMap* InitDataTypeMap() {
RegisterType<cc_type>(retv, proto_type, #cc_type)
_ForEachDataType_(RegType);
// Register pstring individually
RegType(pstring, proto::VarType::PSTRING);
#undef RegType
return retv;
}
......
......@@ -152,6 +152,8 @@ message VarType {
STRINGS = 26;
VOCAB = 27;
FEED_LIST = 28;
// The data type of phi::StringTensor
PSTRING = 29;
}
required Type type = 1;
......
......@@ -23,7 +23,7 @@ add_subdirectory(tools)
add_subdirectory(tests)
# make an unity target for compile deps
set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor)
set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor)
get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
set(PHI_DEPS ${PHI_DEPS} ${phi_kernels})
......
add_subdirectory(lib)
cc_library(phi_api SRCS all.cc DEPS phi_function_api phi_bw_function_api sparse_api sparse_bw_api)
cc_library(phi_api SRCS all.cc DEPS phi_function_api phi_bw_function_api sparse_api sparse_bw_api strings_api)
......@@ -51,6 +51,14 @@ set(sparse_bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_bw_a
set(sparse_bw_api_header_file_tmp ${sparse_bw_api_header_file}.tmp)
set(sparse_bw_api_source_file_tmp ${sparse_bw_api_source_file}.tmp)
# strings api file
set(strings_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api_gen.py)
set(strings_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api.yaml)
set(strings_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/strings_api.h)
set(strings_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/strings_api.cc)
set(strings_api_header_file_tmp ${strings_api_header_file}.tmp)
set(strings_api_source_file_tmp ${strings_api_source_file}.tmp)
# wrapped infermeta file
set(wrapped_infermeta_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py)
set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml)
......@@ -114,6 +122,19 @@ add_custom_command(
DEPENDS ${sparse_bw_api_yaml_file} ${sparse_bw_api_gen_file} ${api_gen_base} ${api_gen_file} ${sparse_api_gen_file} ${bw_api_gen_file}
VERBATIM)
# generate strings api
# Runs the strings_api_gen.py code generator against strings_api.yaml to
# produce strings_api.h / strings_api.cc. The generator writes to *.tmp
# files which are only copied over the real outputs when they differ, so
# unchanged generated code does not trigger rebuilds.
add_custom_command(
  OUTPUT ${strings_api_header_file} ${strings_api_source_file}
  COMMAND ${PYTHON_EXECUTABLE} ${strings_api_gen_file}
    --api_yaml_path ${strings_api_yaml_file}
    --api_header_path ${strings_api_header_file_tmp}
    --api_source_path ${strings_api_source_file_tmp}
  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_header_file_tmp} ${strings_api_header_file}
  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_source_file_tmp} ${strings_api_source_file}
  COMMENT "copy_if_different ${strings_api_header_file} ${strings_api_source_file}"
  DEPENDS ${strings_api_yaml_file} ${strings_api_gen_file} ${api_gen_base} ${api_gen_file}
  VERBATIM)
# generate dygraph(intermediate) api
add_custom_command(
OUTPUT ${dygraph_api_header_file} ${dygraph_api_source_file}
......@@ -152,5 +173,5 @@ cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw ph
cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl)
cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl)
cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform phi_function_api sparse_api)
cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api)
cc_library(strings_api SRCS ${strings_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils)
cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api strings_api)
......@@ -19,3 +19,4 @@ limitations under the License. */
// PD_DECLARE_API(Math);
// PD_DECLARE_API(SparseApi);
// PD_DECLARE_API(StringsApi);
......@@ -56,6 +56,10 @@ std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
return nullptr;
}
// Extracts the underlying phi::StringTensor from a paddle Tensor.
// Returns nullptr when the tensor's impl is not a StringTensor
// (dynamic_pointer_cast yields an empty shared_ptr).
std::shared_ptr<phi::StringTensor> TensorToStringTensor(const Tensor& tensor) {
  return std::dynamic_pointer_cast<phi::StringTensor>(tensor.impl());
}
/* ----------------- for infer_meta --------------------- */
phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) {
......@@ -92,6 +96,10 @@ paddle::optional<phi::MetaTensor> MakeMetaTensor(
return {paddle::none};
}
// Wraps a StringTensor in a MetaTensor so shape/dtype inference
// (infer_meta) can treat it uniformly with other tensor kinds.
phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor) {
  return phi::MetaTensor(tensor);
}
/* ------------------ for output ----------------------- */
phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) {
......@@ -148,5 +156,20 @@ phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type) {
return out->impl().get();
}
// Prepares `out` as a strings-kernel output and returns its raw impl.
// If `out` is not yet initialized and a STRING_TENSOR output is
// requested, a fresh phi::StringTensor impl is attached first.
// `backend` is accepted for interface symmetry with the other
// Set*KernelOutput helpers but is not consulted here.
phi::TensorBase* SetStringsKernelOutput(Backend backend,
                                        Tensor* out,
                                        TensorType type) {
  const bool needs_impl = !out->initialized() &&
                          type == TensorType::STRING_TENSOR &&
                          out->impl() == nullptr;
  if (needs_impl) {
    out->set_impl(std::make_shared<phi::StringTensor>());
  }
  return out->impl().get();
}
} // namespace experimental
} // namespace paddle
......@@ -22,11 +22,12 @@ limitations under the License. */
#include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/string_tensor.h"
namespace paddle {
namespace experimental {
enum class TensorType { DENSE_TENSOR, SPARSE_CSR, SPARSE_COO };
enum class TensorType { DENSE_TENSOR, SPARSE_CSR, SPARSE_COO, STRING_TENSOR };
/* ------------------ for input ----------------------- */
......@@ -43,6 +44,8 @@ std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(const Tensor& tensor);
std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
const paddle::optional<Tensor>& tensor);
std::shared_ptr<phi::StringTensor> TensorToStringTensor(const Tensor& tensor);
/* ----------------- for infer_meta --------------------- */
phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor);
......@@ -58,6 +61,8 @@ phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor);
paddle::optional<phi::MetaTensor> MakeMetaTensor(
const paddle::optional<const phi::SelectedRows&>& tensor);
phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor);
/* ------------------ for output ----------------------- */
phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out);
......@@ -70,5 +75,9 @@ phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out);
phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type);
phi::TensorBase* SetStringsKernelOutput(Backend backend,
Tensor* out,
TensorType type);
} // namespace experimental
} // namespace paddle
cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS
tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits scalar)
tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits scalar string_tensor)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Copyright 2019 The TensorFlow Authors. All Rights Reserved.
This file is inspired by
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/ctstring_internal.h
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#if (defined(__NVCC__) || defined(__HIPCC__))
#define HOSTDEVICE __host__ __device__
#define DEVICE __device__
#define HOST __host__
#else
#define HOSTDEVICE
#define DEVICE
#define HOST
#endif
#if (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || \
defined(_WIN32)
#define PD_PSTRING_LITTLE_ENDIAN 1
#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define PD_PSTRING_LITTLE_ENDIAN 0
#else
#error "Unable to detect endianness."
#endif
// swap32: byte-swaps a 32-bit integer, selecting the fastest available
// mechanism for the detected compiler.
#if defined(__clang__) || \
    (defined(__GNUC__) && \
     ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ >= 5))
// clang and GCC >= 4.8 provide the __builtin_bswap32 intrinsic.
HOSTDEVICE static inline uint32_t swap32(uint32_t host_int) {
  return __builtin_bswap32(host_int);
}
#elif defined(_MSC_VER)
// MSVC intrinsic.
HOSTDEVICE static inline uint32_t swap32(uint32_t host_int) {
  return _byteswap_ulong(host_int);
}
#elif defined(__APPLE__)
// macOS libkern byte-order helper.
HOSTDEVICE static inline uint32_t swap32(uint32_t host_int) {
  return OSSwapInt32(host_int);
}
#else
// Portable fallback: glibc's bswap_32 if available, otherwise a manual
// shift/mask swap of the four bytes.
HOSTDEVICE static inline uint32_t swap32(uint32_t host_int) {
#if defined(__GLIBC__)
  return bswap_32(host_int);
#else   // defined(__GLIBC__)
  return (((host_int & uint32_t{0xFF}) << 24) |
          ((host_int & uint32_t{0xFF00}) << 8) |
          ((host_int & uint32_t{0xFF0000}) >> 8) |
          ((host_int & uint32_t{0xFF000000}) >> 24));
#endif  // defined(__GLIBC__)
}
#endif
// PD_le32toh: converts a little-endian 32-bit value to host byte order.
// Identity on little-endian hosts and in device (NVCC/HIP) code.
#if PD_PSTRING_LITTLE_ENDIAN || (defined(__NVCC__) || defined(__HIPCC__))
#define PD_le32toh(x) x
#else  // PD_PSTRING_LITTLE_ENDIAN
#define PD_le32toh(x) swap32(x)
#endif  // PD_PSTRING_LITTLE_ENDIAN
// Rounds `i` up to the next multiple of 16 (used to size allocations).
HOSTDEVICE static inline size_t PD_align16(size_t i) {
  const size_t kMask = static_cast<size_t>(0xF);
  return (i + kMask) & ~kMask;
}
// Returns the larger of two sizes.
HOSTDEVICE static inline size_t PD_max(size_t a, size_t b) {
  if (a > b) {
    return a;
  }
  return b;
}
// Returns the smaller of two sizes.
HOSTDEVICE static inline size_t PD_min(size_t a, size_t b) {
  if (a < b) {
    return a;
  }
  return b;
}
// Storage tag kept in the low 2 bits of the first byte of a PD_PString
// (see PD_PString_GetType).
typedef enum PD_PString_Type {  // NOLINT
  PD_PSTR_SMALL = 0x00,   // inline buffer, owns its bytes
  PD_PSTR_LARGE = 0x01,   // heap-allocated, owns its bytes
  PD_PSTR_OFFSET = 0x02,  // bytes at a relative offset (read-only)
  PD_PSTR_VIEW = 0x03,    // non-owning pointer + size (read-only)
  PD_PSTR_TYPE_MASK = 0x03
} PD_PString_Type;

// Heap-backed representation.
typedef struct PD_PString_Large {  // NOLINT
  size_t size;
  size_t cap;
  char *ptr;
} PD_PString_Large;

// Representation addressing bytes relative to the PD_PString object
// itself (see PD_PString_GetDataPointer).
typedef struct PD_PString_Offset {  // NOLINT
  uint32_t size;
  uint32_t offset;
  uint32_t count;
} PD_PString_Offset;

// Non-owning view over external bytes.
typedef struct PD_PString_View {  // NOLINT
  size_t size;
  const char *ptr;
} PD_PString_View;

// Raw byte access to the 24-byte union.
typedef struct PD_PString_Raw {  // NOLINT
  uint8_t raw[24];
} PD_PString_Raw;

typedef union PD_PString_Union {  // NOLINT
  PD_PString_Large large;
  PD_PString_Offset offset;
  PD_PString_View view;
  PD_PString_Raw raw;
} PD_PString_Union;

// Maximum byte count an inline SMALL string can hold: the union size
// minus the trailing NUL and the one-byte size field.
enum {
  PD_PString_SmallCapacity =
      (sizeof(PD_PString_Union) - sizeof(/* null delim */ char) -
       sizeof(/* uint8_t size */ uint8_t)),
};

// Inline (small-string-optimized) representation.
typedef struct PD_PString_Small {  // NOLINT
  uint8_t size;
  char str[PD_PString_SmallCapacity + sizeof(/* null delim */ char)];
} PD_PString_Small;

// Tagged union string; the active member is identified by the low 2
// bits of the first raw byte.
typedef struct PD_PString {  // NOLINT
  union {
    PD_PString_Small smll;
    PD_PString_Large large;
    PD_PString_Offset offset;
    PD_PString_View view;
    PD_PString_Raw raw;
  } u;
} PD_PString;
// Returns the storage type encoded in the low 2 bits of the first raw
// byte of the union.
HOSTDEVICE static inline PD_PString_Type PD_PString_GetType(
    const PD_PString *str) {
  return (PD_PString_Type)(str->u.raw.raw[0] & PD_PSTR_TYPE_MASK);  // NOLINT
}
// Decodes a stored size field back into the logical byte length.
// The 2-bit type tag occupies the low bits of the first byte in memory:
// on little-endian hosts that is simply the low 2 bits of the value, so
// a shift suffices; on big-endian hosts only the most significant byte
// carries the tag, so it is masked out separately.
HOSTDEVICE static inline size_t PD_PString_ToActualSizeT(size_t size) {
#if PD_PSTRING_LITTLE_ENDIAN
  return size >> 2;
#else  // PD_PSTRING_LITTLE_ENDIAN
  // 0xFF000000 or 0xFF00000000000000 depending on platform
  static const size_t mask = ~((~(size_t)0) >> 8);  // NOLINT
  return (((mask << 2) & size) >> 2) | (~mask & size);
#endif  // PD_PSTRING_LITTLE_ENDIAN
}
// Encodes a logical byte length together with the 2-bit type tag into
// the internal size-field representation (inverse of ToActualSizeT).
HOSTDEVICE static inline size_t PD_PString_ToInternalSizeT(
    size_t size, PD_PString_Type type) {
#if PD_PSTRING_LITTLE_ENDIAN
  return (size << 2) | type;
#else  // PD_PSTRING_LITTLE_ENDIAN
  // 0xFF000000 or 0xFF00000000000000 depending on platform
  static const size_t mask = ~((~(size_t)0) >> 8);  // NOLINT
  return (mask & (size << 2)) | (~mask & size) |
         ((size_t)type << ((sizeof(size_t) - 1) * 8));  // NOLINT
#endif  // PD_PSTRING_LITTLE_ENDIAN
}
/*
 * NOTE: the following memory helpers may need to be reimplemented in
 * another source file for some build targets.
 */
// free() wrapper; `size` mirrors an allocator-style interface but is
// unused here.
HOSTDEVICE static inline void PD_Free(void *ptr, size_t size) { free(ptr); }
// memset() replacement that also works in device code: fills `size`
// bytes at `src` with `ch` and returns the buffer.
HOSTDEVICE static inline void *PD_Memset(void *src, int ch, size_t size) {
  char *bytes = (char *)src;  // NOLINT
  size_t i = 0;
  while (i < size) {
    bytes[i] = ch;
    ++i;
  }
  return bytes;
}
// memcpy() replacement that also works in device code. Ranges must not
// overlap (use PD_Memmove for that). Returns `dst`.
HOSTDEVICE static inline void *PD_Memcpy(void *dst,
                                         const void *src,
                                         size_t size) {
  char *to = (char *)dst;                // NOLINT
  const char *from = (const char *)src;  // NOLINT
  for (size_t i = 0; i < size; ++i) {
    to[i] = from[i];
  }
  return dst;
}
// malloc() wrapper kept symmetric with PD_Free/PD_Realloc.
HOSTDEVICE static inline void *PD_Malloc(size_t size) { return malloc(size); }
// realloc() replacement. Device builds (NVCC/HIP) have no realloc, so
// growth is emulated with malloc + copy + free; shrink requests keep
// the existing buffer unchanged.
HOSTDEVICE static inline void *PD_Realloc(void *ptr,
                                          size_t old_size,
                                          size_t new_size) {
#if (defined(__NVCC__) || defined(__HIPCC__))
  if (old_size >= new_size) {
    return ptr;
  }
  // NOTE(review): malloc result is unchecked; on allocation failure
  // PD_Memcpy would write through NULL — confirm callers tolerate this.
  void *new_ptr = malloc(new_size);
  PD_Memcpy(new_ptr, ptr, old_size);
  free(ptr);
  return new_ptr;
#else
  return realloc(ptr, new_size);
#endif
}
// memcmp() replacement that also works in device code: compares `size`
// bytes as unsigned values; returns <0, 0, or >0 like memcmp.
HOSTDEVICE static inline int PD_Memcmp(const void *s1,
                                       const void *s2,
                                       size_t size) {
  const uint8_t *lhs = (const uint8_t *)(s1);  // NOLINT
  const uint8_t *rhs = (const uint8_t *)(s2);  // NOLINT
  size_t i = 0;
  while (i < size) {
    if (lhs[i] != rhs[i]) {
      return (lhs[i] - rhs[i]);
    }
    ++i;
  }
  return 0;
}
// memmove() replacement that also works in device code: copies `size`
// bytes handling overlapping ranges correctly. When the destination
// overlaps the tail of the source, bytes are copied backward; when it
// overlaps the head, forward; otherwise a plain PD_Memcpy is used.
// Fix: the previous version iterated with `int i` and compared against
// `static_cast<int>(size)`, which overflows/truncates for sizes larger
// than INT_MAX (undefined behavior). All index/offset arithmetic is now
// done in size_t.
HOSTDEVICE static inline void *PD_Memmove(void *dest,
                                          const void *src,
                                          size_t size) {
  const uint8_t *from = (const uint8_t *)(src);  // NOLINT
  uint8_t *to = (uint8_t *)(dest);               // NOLINT
  if (from == to || size == 0) {
    return dest;
  }
  if (to > from && (size_t)(to - from) < size) {  // NOLINT
    // Overlap with dest ahead of src: copy backward so source bytes are
    // read before they are overwritten.
    for (size_t i = size; i-- > 0;) {
      to[i] = from[i];
    }
    return dest;
  }
  if (from > to && (size_t)(from - to) < size) {  // NOLINT
    // Overlap with src ahead of dest: a forward copy is safe.
    for (size_t i = 0; i < size; i++) {
      to[i] = from[i];
    }
    return dest;
  }
  dest = PD_Memcpy(dest, src, size);
  return dest;
}
// Zero-fills the union, which yields an empty SMALL string (type tag 0,
// size 0).
HOSTDEVICE static inline void PD_PString_Init(PD_PString *str) {
  PD_Memset(str->u.raw.raw, 0, sizeof(PD_PString_Raw));
}
// Releases heap storage for LARGE strings and resets them to an empty
// SMALL string. SMALL/OFFSET/VIEW own no heap memory and are left
// untouched.
HOSTDEVICE static inline void PD_PString_Dealloc(PD_PString *str) {
  if (PD_PString_GetType(str) == PD_PSTR_LARGE &&
      str->u.large.ptr != NULL) {  // NOLINT
    PD_Free(str->u.large.ptr, str->u.large.cap + 1);
    PD_PString_Init(str);
  }
}
// Returns the logical length in bytes for any storage type.
HOSTDEVICE static inline size_t PD_PString_GetSize(const PD_PString *str) {
  switch (PD_PString_GetType(str)) {
    case PD_PSTR_SMALL:
      // Size shares a byte with the 2-bit type tag.
      return str->u.smll.size >> 2;
    case PD_PSTR_LARGE:
      return PD_PString_ToActualSizeT(str->u.large.size);
    case PD_PSTR_OFFSET:
      // Offset sizes are stored little-endian; convert to host order.
      return PD_le32toh(str->u.offset.size) >> 2;
    case PD_PSTR_VIEW:
      return PD_PString_ToActualSizeT(str->u.view.size);
    default:
      return 0;  // Unreachable.
  }
}
// Returns the writable capacity in bytes. OFFSET/VIEW are read-only
// representations, so their capacity is reported as 0.
HOSTDEVICE static inline size_t PD_PString_GetCapacity(const PD_PString *str) {
  switch (PD_PString_GetType(str)) {
    case PD_PSTR_SMALL:
      return PD_PString_SmallCapacity;
    case PD_PSTR_LARGE:
      return str->u.large.cap;
    case PD_PSTR_OFFSET:
    case PD_PSTR_VIEW:
    default:
      return 0;
  }
}
// Returns a read-only pointer to the string's bytes for any storage
// type. For OFFSET the bytes live at a relative offset from the
// PD_PString object itself.
HOSTDEVICE static inline const char *PD_PString_GetDataPointer(
    const PD_PString *str) {
  switch (PD_PString_GetType(str)) {
    case PD_PSTR_SMALL:
      return str->u.smll.str;
    case PD_PSTR_LARGE:
      return str->u.large.ptr;
    case PD_PSTR_OFFSET:
      return (const char *)str + str->u.offset.offset;  // NOLINT
    case PD_PSTR_VIEW:
      return str->u.view.ptr;
    default:
      // Unreachable.
      return NULL;  // NOLINT
  }
}
// Resizes `str` to `new_size` bytes, leaving any newly grown region
// uninitialized, and converts between SMALL and LARGE representations
// as needed (OFFSET/VIEW are materialized into writable storage).
// Returns a mutable pointer to the NUL-terminated buffer.
HOSTDEVICE static inline char *PD_PString_ResizeUninitialized(PD_PString *str,
                                                              size_t new_size) {
  size_t curr_size = PD_PString_GetSize(str);
  size_t copy_size = PD_min(new_size, curr_size);
  PD_PString_Type curr_type = PD_PString_GetType(str);
  const char *curr_ptr = PD_PString_GetDataPointer(str);
  // Case: SMALL/LARGE/VIEW/OFFSET -> SMALL
  if (new_size <= PD_PString_SmallCapacity) {
    // Pack size and the SMALL tag into the single size byte.
    str->u.smll.size = (uint8_t)((new_size << 2) | PD_PSTR_SMALL);  // NOLINT
    str->u.smll.str[new_size] = '\0';
    if (curr_type != PD_PSTR_SMALL && copy_size) {
      PD_Memcpy(str->u.smll.str, curr_ptr, copy_size);
    }
    if (curr_type == PD_PSTR_LARGE) {
      // Previous heap buffer is no longer referenced; release it.
      PD_Free((void *)curr_ptr, str->u.large.cap + 1);  // NOLINT
    }
    return str->u.smll.str;
  }
  // Case: SMALL/LARGE/VIEW/OFFSET -> LARGE
  size_t new_cap;
  size_t curr_cap = PD_PString_GetCapacity(str);
  if (new_size < curr_size && new_size < curr_cap / 2) {
    // Substantial shrink: give back roughly half the capacity.
    new_cap = PD_align16(curr_cap / 2 + 1) - 1;
  } else if (new_size > curr_cap) {
    new_cap = PD_align16(new_size + 1) - 1;
  } else {
    new_cap = curr_cap;
  }
  char *new_ptr;
  if (new_cap == curr_cap) {
    new_ptr = str->u.large.ptr;
  } else if (curr_type == PD_PSTR_LARGE) {
    new_ptr = (char *)PD_Realloc(  // NOLINT
        str->u.large.ptr,
        curr_cap + 1,
        new_cap + 1);
  } else {
    new_ptr = (char *)PD_Malloc(new_cap + 1);  // NOLINT
    if (copy_size) {
      PD_Memcpy(new_ptr, curr_ptr, copy_size);
    }
  }
  str->u.large.size = PD_PString_ToInternalSizeT(new_size, PD_PSTR_LARGE);
  str->u.large.ptr = new_ptr;
  str->u.large.ptr[new_size] = '\0';
  str->u.large.cap = new_cap;
  return str->u.large.ptr;
}
// Returns a writable pointer to the string's bytes. Read-only
// OFFSET/VIEW representations are first converted into SMALL/LARGE
// storage via a same-size resize.
HOSTDEVICE static inline char *PD_PString_GetMutableDataPointer(
    PD_PString *str) {
  switch (PD_PString_GetType(str)) {
    case PD_PSTR_SMALL:
      return str->u.smll.str;
    case PD_PSTR_OFFSET:
    case PD_PSTR_VIEW:
      // Convert OFFSET/VIEW to SMALL/LARGE
      PD_PString_ResizeUninitialized(str, PD_PString_GetSize(str));
      return (PD_PString_GetType(str) == PD_PSTR_SMALL) ? str->u.smll.str
                                                        : str->u.large.ptr;
    case PD_PSTR_LARGE:
      return str->u.large.ptr;
    default:
      // Unreachable.
      return NULL;  // NOLINT
  }
}
// Ensures capacity for at least `new_cap` bytes without changing the
// logical size. VIEW/OFFSET strings are materialized into LARGE
// storage; capacity is never reduced here.
HOSTDEVICE static inline void PD_PString_Reserve(PD_PString *str,
                                                 size_t new_cap) {
  PD_PString_Type curr_type = PD_PString_GetType(str);
  if (new_cap <= PD_PString_SmallCapacity) {
    // We do nothing, we let Resize/GetMutableDataPointer handle the
    // conversion to SMALL from VIEW/OFFSET when the need arises.
    // In the degenerate case, where new_cap <= PD_PString_SmallCapacity,
    // curr_size > PD_PString_SmallCapacity, and the type is VIEW/OFFSET, we
    // defer the malloc to Resize/GetMutableDataPointer.
    return;
  }
  if (curr_type == PD_PSTR_LARGE && new_cap <= str->u.large.cap) {
    // We handle reduced cap in resize.
    return;
  }
  // Case: VIEW/OFFSET -> LARGE or grow an existing LARGE type
  size_t curr_size = PD_PString_GetSize(str);
  const char *curr_ptr = PD_PString_GetDataPointer(str);
  // Since VIEW and OFFSET types are read-only, their capacity is effectively 0.
  // So we make sure we have enough room in the VIEW and OFFSET cases.
  new_cap = PD_align16(PD_max(new_cap, curr_size) + 1) - 1;
  size_t curr_cap = PD_PString_GetCapacity(str);
  if (curr_type == PD_PSTR_LARGE) {
    str->u.large.ptr = (char *)PD_Realloc(  // NOLINT
        str->u.large.ptr,
        curr_cap + 1,
        new_cap + 1);
  } else {
    // Convert to Large
    char *new_ptr = (char *)PD_Malloc(new_cap + 1);  // NOLINT
    PD_Memcpy(new_ptr, curr_ptr, curr_size);
    str->u.large.size = PD_PString_ToInternalSizeT(curr_size, PD_PSTR_LARGE);
    str->u.large.ptr = new_ptr;
    str->u.large.ptr[curr_size] = '\0';
  }
  str->u.large.cap = new_cap;
}
// Reserve with geometric (at least 2x) growth so that repeated appends
// stay amortized O(n) overall.
HOSTDEVICE static inline void PD_PString_ReserveAmortized(PD_PString *str,
                                                          size_t new_cap) {
  const size_t curr_cap = PD_PString_GetCapacity(str);
  if (new_cap > curr_cap) {
    const size_t doubled = 2 * curr_cap;
    PD_PString_Reserve(str, new_cap > doubled ? new_cap : doubled);
  }
}
// Resizes to `new_size` bytes, filling any newly grown tail with `c`.
// Returns a mutable pointer to the buffer.
HOSTDEVICE static inline char *PD_PString_Resize(PD_PString *str,
                                                 size_t new_size,
                                                 char c) {
  const size_t old_size = PD_PString_GetSize(str);
  char *buf = PD_PString_ResizeUninitialized(str, new_size);
  if (new_size > old_size) {
    PD_Memset(buf + old_size, c, new_size - old_size);
  }
  return buf;
}
// Makes dst a non-owning VIEW over the bytes [src, src + size) without
// copying. Any previously owned storage in dst is released first; the
// caller must keep `src` alive while the view is in use.
HOSTDEVICE static inline void PD_PString_AssignView(PD_PString *dst,
                                                    const char *src,
                                                    size_t size) {
  PD_PString_Dealloc(dst);
  dst->u.view.size = PD_PString_ToInternalSizeT(size, PD_PSTR_VIEW);
  dst->u.view.ptr = src;
}
// Appends `src_size` bytes from `src` to `dst`, growing capacity
// amortized so chains of appends stay linear overall.
HOSTDEVICE static inline void PD_PString_AppendN(PD_PString *dst,
                                                 const char *src,
                                                 size_t src_size) {
  if (src_size == 0) {
    return;
  }
  const size_t old_size = PD_PString_GetSize(dst);
  const size_t total_size = old_size + src_size;
  // For append use cases, we want to ensure amortized growth.
  PD_PString_ReserveAmortized(dst, total_size);
  char *buf = PD_PString_ResizeUninitialized(dst, total_size);
  PD_Memcpy(buf + old_size, src, src_size);
}
// Appends the full contents of `src` to `dst`.
HOSTDEVICE static inline void PD_PString_Append(PD_PString *dst,
                                                const PD_PString *src) {
  PD_PString_AppendN(
      dst, PD_PString_GetDataPointer(src), PD_PString_GetSize(src));
}
// Replaces the contents of `dst` with `size` bytes copied from `src`.
HOSTDEVICE static inline void PD_PString_Copy(PD_PString *dst,
                                              const char *src,
                                              size_t size) {
  char *buf = PD_PString_ResizeUninitialized(dst, size);
  if (size != 0) {
    PD_Memcpy(buf, src, size);
  }
}
// Copy assignment. SMALL and VIEW are bitwise-copied (a VIEW stays a
// view over the same external bytes); LARGE contents are deep-copied
// into dst's own storage. OFFSET sources hit the unreachable default.
HOSTDEVICE static inline void PD_PString_Assign(PD_PString *dst,
                                                const PD_PString *src) {
  if (dst == src) return;
  PD_PString_Dealloc(dst);
  switch (PD_PString_GetType(src)) {
    case PD_PSTR_SMALL:
    case PD_PSTR_VIEW:
      *dst = *src;
      return;
    case PD_PSTR_LARGE: {
      const char *src_c = PD_PString_GetDataPointer(src);
      size_t size = PD_PString_GetSize(src);
      PD_PString_Copy(dst, src_c, size);
    }
      return;
    default:
      return;  // Unreachable.
  }
}
// Move assignment. SMALL and VIEW are bitwise-copied; LARGE buffers
// are stolen (src is reset to an empty string); OFFSET sources become a
// VIEW over their bytes in dst.
HOSTDEVICE static inline void PD_PString_Move(PD_PString *dst,
                                              PD_PString *src) {
  if (dst == src) return;
  PD_PString_Dealloc(dst);
  switch (PD_PString_GetType(src)) {
    case PD_PSTR_SMALL:
    case PD_PSTR_VIEW:
      *dst = *src;
      return;
    case PD_PSTR_LARGE:
      *dst = *src;
      // Leave src empty so the heap buffer has a single owner.
      PD_PString_Init(src);
      return;
    case PD_PSTR_OFFSET: {
      const char *src_c = PD_PString_GetDataPointer(src);
      size_t size = PD_PString_GetSize(src);
      PD_PString_AssignView(dst, src_c, size);
    }
      return;
    default:
      return;  // Unreachable.
  }
}
......@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/phi/common/float16.h"
#include "paddle/phi/api/ext/exception.h"
#include "paddle/phi/common/pstring.h"
namespace paddle {
namespace experimental {
......@@ -27,6 +28,7 @@ using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;
using float16 = ::phi::dtype::float16;
using bfloat16 = ::phi::dtype::bfloat16;
using pstring = ::phi::dtype::pstring;
enum class DataType {
UNDEFINED = 0,
......@@ -45,6 +47,11 @@ enum class DataType {
FLOAT64,
COMPLEX64,
COMPLEX128,
// In Paddle 2.3, we add a new type of Tensor, StringTensor, which is designed
// for string data management. We design the dtype of StringTensor, pstring.
// In order to express a unique data dtype of StringTensor, we add
// DataType::PSTRING.
PSTRING,
NUM_DATA_TYPES,
// See Note [ Why we need ALL in baisc kernel key member? ]
ALL_DTYPE = UNDEFINED,
......@@ -72,6 +79,8 @@ inline size_t SizeOf(DataType data_type) {
return 8;
case DataType::COMPLEX128:
return 16;
case DataType::PSTRING:
return 24;
case DataType::UNDEFINED:
return 0;
case DataType::NUM_DATA_TYPES:
......@@ -82,22 +91,23 @@ inline size_t SizeOf(DataType data_type) {
return 0;
}
#define PD_FOR_EACH_DATA_TYPE(_) \
_(bool, DataType::BOOL) \
_(int8_t, DataType::INT8) \
_(uint8_t, DataType::UINT8) \
_(int16_t, DataType::INT16) \
_(uint16_t, DataType::UINT16) \
_(int32_t, DataType::INT32) \
_(uint32_t, DataType::UINT32) \
_(int64_t, DataType::INT64) \
_(uint64_t, DataType::UINT64) \
_(bfloat16, DataType::BFLOAT16) \
_(float16, DataType::FLOAT16) \
_(float, DataType::FLOAT32) \
_(double, DataType::FLOAT64) \
_(complex64, DataType::COMPLEX64) \
_(complex128, DataType::COMPLEX128)
#define PD_FOR_EACH_DATA_TYPE(_) \
_(bool, DataType::BOOL) \
_(int8_t, DataType::INT8) \
_(uint8_t, DataType::UINT8) \
_(int16_t, DataType::INT16) \
_(uint16_t, DataType::UINT16) \
_(int32_t, DataType::INT32) \
_(uint32_t, DataType::UINT32) \
_(int64_t, DataType::INT64) \
_(uint64_t, DataType::UINT64) \
_(bfloat16, DataType::BFLOAT16) \
_(float16, DataType::FLOAT16) \
_(float, DataType::FLOAT32) \
_(double, DataType::FLOAT64) \
_(complex64, DataType::COMPLEX64) \
_(complex128, DataType::COMPLEX128) \
_(pstring, DataType::PSTRING)
template <DataType T>
struct DataTypeToCppType;
......@@ -175,6 +185,9 @@ inline std::ostream& operator<<(std::ostream& os, DataType dtype) {
case DataType::COMPLEX128:
os << "complex128";
break;
case DataType::PSTRING:
os << "pstring";
break;
default:
PD_THROW("Invalid enum data type `", static_cast<int>(dtype), "`.");
}
......@@ -195,4 +208,6 @@ using bfloat16 = paddle::experimental::bfloat16;
using complex64 = paddle::experimental::complex64;
using complex128 = paddle::experimental::complex128;
using float16 = paddle::experimental::float16;
using pstring = paddle::experimental::pstring;
} // namespace paddle
......@@ -29,6 +29,7 @@ enum class DataLayout {
MKLDNN,
SPARSE_COO,
SPARSE_CSR,
PSTRING_UNION,
NUM_DATA_LAYOUTS,
NDHWC,
NCDHW,
......@@ -76,6 +77,8 @@ inline DataLayout StringToDataLayout(const std::string& str) {
return DataLayout::SPARSE_CSR;
} else if (s == "NDHWC") {
return DataLayout::kNDHWC;
} else if (s == "PSTRING_UNION") {
return DataLayout::PSTRING_UNION;
} else if (s == "NCDHW") {
return DataLayout::kNCDHW;
} else {
......@@ -101,6 +104,8 @@ inline std::string DataLayoutToString(const DataLayout& layout) {
return "NDHWC";
case DataLayout::kNCDHW:
return "NCDHW";
case DataLayout::PSTRING_UNION:
return "PSTRING_UNION";
default:
PD_THROW("Unknown Data Layout type ", static_cast<int>(layout), ".");
}
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Copyright 2019 The TensorFlow Authors. All Rights Reserved.
This file is inspired by
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/tstring.h
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <assert.h>
#include <ostream>
#include <string>
#include "paddle/phi/common/cpstring_impl.h"
namespace phi {
namespace dtype {
// pstring is the only dtype of StringTensor, which is
// used to manage string data. It provides almost the same
// interfaces as std::string, including data(),
// length() and so on. Besides, pstring data can be
// manipulated on GPU.
// Thin C++ wrapper over the C-level PD_PString tagged-union string.
// All members are HOSTDEVICE so a pstring can be used in device code
// as well as on the host.
class pstring {
  // Underlying C representation (SMALL/LARGE/OFFSET/VIEW union).
  PD_PString pstr_;

 public:
  // Mirrors the PD_PString_Type storage tags.
  enum Type {
    SMALL = PD_PSTR_SMALL,
    LARGE = PD_PSTR_LARGE,
    OFFSET = PD_PSTR_OFFSET,
    VIEW = PD_PSTR_VIEW,
  };

  typedef const char* const_iterator;

  // Ctor
  HOSTDEVICE pstring();
  HOSTDEVICE pstring(const std::string& str);  // NOLINT
  HOSTDEVICE pstring(const char* str, size_t len);
  HOSTDEVICE pstring(const char* str);  // NOLINT
  HOSTDEVICE pstring(size_t n, char c);

  // Copy
  HOSTDEVICE pstring(const pstring& str);

  // Move
  HOSTDEVICE pstring(pstring&& str) noexcept;

  // Dtor
  HOSTDEVICE ~pstring();

  // Copy Assignment
  HOSTDEVICE pstring& operator=(const pstring& str);
  HOSTDEVICE pstring& operator=(const std::string& str);
  HOSTDEVICE pstring& operator=(const char* str);
  HOSTDEVICE pstring& operator=(char ch);

  // Move Assignment
  HOSTDEVICE pstring& operator=(pstring&& str);

  // Comparison
  HOSTDEVICE int compare(const char* str, size_t len) const;
  HOSTDEVICE bool operator<(const pstring& o) const;
  HOSTDEVICE bool operator>(const pstring& o) const;
  HOSTDEVICE bool operator==(const char* str) const;
  HOSTDEVICE bool operator==(const pstring& o) const;
  HOSTDEVICE bool operator!=(const char* str) const;
  HOSTDEVICE bool operator!=(const pstring& o) const;

  // Conversion Operators
  HOSTDEVICE operator std::string() const;  // NOLINT

  // Attributes
  HOSTDEVICE size_t size() const;
  HOSTDEVICE size_t length() const;
  HOSTDEVICE size_t capacity() const;
  HOSTDEVICE bool empty() const;
  HOSTDEVICE Type type() const;

  // Allocation
  HOSTDEVICE void resize(size_t new_size, char c = 0);
  // Similar to resize, but will leave the newly grown region uninitialized.
  HOSTDEVICE void resize_uninitialized(size_t new_size);
  HOSTDEVICE void clear() noexcept;
  HOSTDEVICE void reserve(size_t n);

  // Iterators
  HOSTDEVICE const_iterator begin() const;
  HOSTDEVICE const_iterator end() const;

  // Const Element Access
  HOSTDEVICE const char* c_str() const;
  HOSTDEVICE const char* data() const;
  HOSTDEVICE const char& operator[](size_t i) const;
  HOSTDEVICE const char& back() const;

  // Mutable Element Access
  // NOTE: named mdata() rather than data() to keep the const overload
  // distinct.
  HOSTDEVICE char* mdata();
  HOSTDEVICE char& operator[](size_t i);

  // Assignment
  HOSTDEVICE pstring& assign(const char* str, size_t len);
  HOSTDEVICE pstring& assign(const char* str);

  // View Assignment
  HOSTDEVICE pstring& assign_as_view(const pstring& str);
  HOSTDEVICE pstring& assign_as_view(const std::string& str);
  HOSTDEVICE pstring& assign_as_view(const char* str, size_t len);
  HOSTDEVICE pstring& assign_as_view(const char* str);

  // Modifiers
  // NOTE: Invalid input will result in undefined behavior.
  HOSTDEVICE pstring& append(const pstring& str);
  HOSTDEVICE pstring& append(const char* str, size_t len);
  HOSTDEVICE pstring& append(const char* str);
  HOSTDEVICE pstring& append(size_t n, char c);

  HOSTDEVICE pstring& erase(size_t pos, size_t len);

  HOSTDEVICE pstring& insert(size_t pos,
                             const pstring& str,
                             size_t subpos,
                             size_t sublen);
  HOSTDEVICE pstring& insert(size_t pos, size_t n, char c);
  HOSTDEVICE void swap(pstring& str);
  HOSTDEVICE void push_back(char ch);

  // Friends
  HOSTDEVICE friend bool operator==(const char* a, const pstring& b);
  HOSTDEVICE friend bool operator==(const std::string& a, const pstring& b);
  HOSTDEVICE friend pstring operator+(const pstring& a, const pstring& b);
  HOSTDEVICE friend std::ostream& operator<<(std::ostream& o,
                                             const pstring& str);
};
// Non-member function overloads
HOSTDEVICE bool operator==(const char* a, const pstring& b);
HOSTDEVICE bool operator==(const std::string& a, const pstring& b);
HOSTDEVICE pstring operator+(const pstring& a, const pstring& b);
HOSTDEVICE std::ostream& operator<<(std::ostream& o, const pstring& str);
HOSTDEVICE size_t strlen(const char* start);
// Implementations
// Ctor
// Default ctor: initializes pstr_ to an empty string.
HOSTDEVICE inline pstring::pstring() { PD_PString_Init(&pstr_); }
// Copies `len` bytes from `str`; embedded NUL bytes are preserved.
HOSTDEVICE inline pstring::pstring(const char* str, size_t len) {
  PD_PString_Init(&pstr_);
  PD_PString_Copy(&pstr_, str, len);
}
// NUL-terminated C-string ctor; uses dtype::strlen so it works on device.
HOSTDEVICE inline pstring::pstring(const char* str)
    : pstring(str, strlen(str)) {}
// Builds a string of `n` copies of `c`.
HOSTDEVICE inline pstring::pstring(size_t n, char c) {
  PD_PString_Init(&pstr_);
  PD_PString_Resize(&pstr_, n, c);
}
HOSTDEVICE inline pstring::pstring(const std::string& str)
    : pstring(str.data(), str.size()) {}
// Copy ctor: copies the payload via PD_PString_Assign.
HOSTDEVICE inline pstring::pstring(const pstring& str) {
  PD_PString_Init(&pstr_);
  PD_PString_Assign(&pstr_, &str.pstr_);
}
// Move
// Move ctor: steals the payload; the state `str` is left in is defined by
// PD_PString_Move.
HOSTDEVICE inline pstring::pstring(pstring&& str) noexcept {
  PD_PString_Init(&pstr_);
  PD_PString_Move(&pstr_, &str.pstr_);
}
// Dtor
HOSTDEVICE inline pstring::~pstring() { PD_PString_Dealloc(&pstr_); }
// Copy Assignment
// NOTE(review): none of these operators guard against self-assignment;
// whether `s = s` / `s = std::move(s)` is safe depends on the semantics of
// PD_PString_Assign / PD_PString_Move — confirm in cpstring_impl.h.
HOSTDEVICE inline pstring& pstring::operator=(const pstring& str) {
  PD_PString_Assign(&pstr_, &str.pstr_);
  return *this;
}
HOSTDEVICE inline pstring& pstring::operator=(const std::string& str) {
  PD_PString_Copy(&pstr_, str.data(), str.size());
  return *this;
}
HOSTDEVICE inline pstring& pstring::operator=(const char* str) {
  PD_PString_Copy(&pstr_, str, strlen(str));
  return *this;
}
// Assigning a single char yields a one-character string.
HOSTDEVICE inline pstring& pstring::operator=(char c) {
  resize_uninitialized(1);
  (*this)[0] = c;
  return *this;
}
// Move Assignment
HOSTDEVICE inline pstring& pstring::operator=(pstring&& str) {
  PD_PString_Move(&pstr_, &str.pstr_);
  return *this;
}
// Comparison
// Three-way compare against the byte range [str, str + len), in the manner
// of std::string::compare: the common prefix decides first; if the prefixes
// are equal, the shorter string orders before the longer one.
HOSTDEVICE inline int pstring::compare(const char* str, size_t len) const {
  const size_t this_size = size();
  const size_t prefix_len = (len < this_size) ? len : this_size;
  const int prefix_cmp = PD_Memcmp(data(), str, prefix_len);
  if (prefix_cmp != 0) {
    return (prefix_cmp < 0) ? -1 : 1;
  }
  if (this_size == len) {
    return 0;
  }
  return (this_size < len) ? -1 : 1;
}
// Relational operators are all expressed through three-way compare().
HOSTDEVICE inline bool pstring::operator<(const pstring& rhs) const {
  return compare(rhs.data(), rhs.size()) < 0;
}
HOSTDEVICE inline bool pstring::operator>(const pstring& rhs) const {
  return compare(rhs.data(), rhs.size()) > 0;
}
// Equality short-circuits on length before touching the bytes.
HOSTDEVICE inline bool pstring::operator==(const char* rhs) const {
  if (strlen(rhs) != size()) {
    return false;
  }
  return PD_Memcmp(data(), rhs, size()) == 0;
}
HOSTDEVICE inline bool pstring::operator==(const pstring& rhs) const {
  if (rhs.size() != size()) {
    return false;
  }
  return PD_Memcmp(data(), rhs.data(), size()) == 0;
}
HOSTDEVICE inline bool pstring::operator!=(const char* rhs) const {
  return !(*this == rhs);
}
HOSTDEVICE inline bool pstring::operator!=(const pstring& rhs) const {
  return !(*this == rhs);
}
// Conversion Operators
// Copies the bytes into a host std::string (host-side use only in practice).
HOSTDEVICE inline pstring::operator std::string() const {
  return std::string(data(), size());
}
// Attributes
HOSTDEVICE inline size_t pstring::size() const {
  return PD_PString_GetSize(&pstr_);
}
// length() is a std::string-style alias of size().
HOSTDEVICE inline size_t pstring::length() const { return size(); }
// Number of chars currently usable without reallocation.
HOSTDEVICE inline size_t pstring::capacity() const {
  return PD_PString_GetCapacity(&pstr_);
}
HOSTDEVICE inline bool pstring::empty() const { return size() == 0; }
// Storage category (SMALL/LARGE/OFFSET/VIEW) of the underlying PD_PString.
HOSTDEVICE inline pstring::Type pstring::type() const {
  return static_cast<pstring::Type>(PD_PString_GetType(&pstr_));
}
// Allocation
// Grows/shrinks to new_size; any newly added chars are filled with `c`.
HOSTDEVICE inline void pstring::resize(size_t new_size, char c) {
  PD_PString_Resize(&pstr_, new_size, c);
}
// Like resize, but leaves any newly grown region uninitialized.
HOSTDEVICE inline void pstring::resize_uninitialized(size_t new_size) {
  PD_PString_ResizeUninitialized(&pstr_, new_size);
}
// Sets size to 0; whether capacity is released depends on the PD_PString
// implementation.
HOSTDEVICE inline void pstring::clear() noexcept {
  PD_PString_ResizeUninitialized(&pstr_, 0);
}
HOSTDEVICE inline void pstring::reserve(size_t n) {
  PD_PString_Reserve(&pstr_, n);
}
// Iterators
// Only const iteration is exposed; mutate through mdata()/operator[].
HOSTDEVICE inline pstring::const_iterator pstring::begin() const {
  return &(*this)[0];
}
HOSTDEVICE inline pstring::const_iterator pstring::end() const {
  return &(*this)[size()];
}
// Element Access
// c_str() is an alias of data(); NOTE(review): a trailing NUL is only
// guaranteed if the underlying PD_PString stores one — confirm before
// relying on C-string semantics.
HOSTDEVICE inline const char* pstring::c_str() const { return data(); }
HOSTDEVICE inline const char* pstring::data() const {
  return PD_PString_GetDataPointer(&pstr_);
}
// No bounds checking, like std::string::operator[].
HOSTDEVICE inline const char& pstring::operator[](size_t i) const {
  return data()[i];
}
// Precondition: !empty().
HOSTDEVICE inline const char& pstring::back() const {
  return (*this)[size() - 1];
}
// Mutable data pointer via PD_PString_GetMutableDataPointer.
HOSTDEVICE inline char* pstring::mdata() {
  return PD_PString_GetMutableDataPointer(&pstr_);
}
HOSTDEVICE inline char& pstring::operator[](size_t i) { return mdata()[i]; }
// Assignment
// Replaces the contents with [str, str + len).
HOSTDEVICE inline pstring& pstring::assign(const char* str, size_t len) {
  PD_PString_Copy(&pstr_, str, len);
  return *this;
}
HOSTDEVICE inline pstring& pstring::assign(const char* str) {
  assign(str, strlen(str));
  return *this;
}
// View Assignment
// The assign_as_view overloads turn this pstring into a non-owning VIEW of
// the given bytes: the caller must keep the viewed buffer alive for as long
// as the view is used.
HOSTDEVICE inline pstring& pstring::assign_as_view(const pstring& str) {
  assign_as_view(str.data(), str.size());
  return *this;
}
HOSTDEVICE inline pstring& pstring::assign_as_view(const std::string& str) {
  assign_as_view(str.data(), str.size());
  return *this;
}
HOSTDEVICE inline pstring& pstring::assign_as_view(const char* str,
                                                   size_t len) {
  PD_PString_AssignView(&pstr_, str, len);
  return *this;
}
HOSTDEVICE inline pstring& pstring::assign_as_view(const char* str) {
  assign_as_view(str, strlen(str));
  return *this;
}
// Modifiers
// NOTE: Invalid input will result in undefined behavior.
HOSTDEVICE inline pstring& pstring::append(const pstring& str) {
  PD_PString_Append(&pstr_, &str.pstr_);
  return *this;
}
HOSTDEVICE inline pstring& pstring::append(const char* str, size_t len) {
  PD_PString_AppendN(&pstr_, str, len);
  return *this;
}
HOSTDEVICE inline pstring& pstring::append(const char* str) {
  append(str, strlen(str));
  return *this;
}
// Appends `n` copies of `c`.
HOSTDEVICE inline pstring& pstring::append(size_t n, char c) {
  // For append use cases, we want to ensure amortized growth.
  const size_t new_size = size() + n;
  PD_PString_ReserveAmortized(&pstr_, new_size);
  resize(new_size, c);
  return *this;
}
// Removes `len` chars starting at `pos` by shifting the tail left. No bounds
// checking: out-of-range pos/len is UB (see NOTE above).
HOSTDEVICE inline pstring& pstring::erase(size_t pos, size_t len) {
  PD_Memmove(mdata() + pos, data() + pos + len, size() - len - pos);
  resize(size() - len);
  return *this;
}
// Inserts str[subpos, subpos + sublen) at `pos`.
// NOTE(review): inserting a substring of *this (&str == this) reads from the
// buffer that was just resized — relies on the invalid-input-is-UB contract.
HOSTDEVICE inline pstring& pstring::insert(size_t pos,
                                           const pstring& str,
                                           size_t subpos,
                                           size_t sublen) {
  size_t orig_size = size();
  PD_PString_ResizeUninitialized(&pstr_, orig_size + sublen);
  PD_Memmove(mdata() + pos + sublen, data() + pos, orig_size - pos);
  PD_Memmove(mdata() + pos, str.data() + subpos, sublen);
  return *this;
}
// Inserts `n` copies of `c` at `pos`.
HOSTDEVICE inline pstring& pstring::insert(size_t pos, size_t n, char c) {
  size_t size_ = size();
  PD_PString_ResizeUninitialized(&pstr_, size_ + n);
  PD_Memmove(mdata() + pos + n, data() + pos, size_ - pos);
  PD_Memset(mdata() + pos, c, n);
  return *this;
}
HOSTDEVICE inline void pstring::swap(pstring& str) {
  std::swap(pstr_, str.pstr_);
}
HOSTDEVICE inline void pstring::push_back(char ch) { append(1, ch); }
// Friends
// Equality against a pstring from the left-hand side; length is checked
// before the byte comparison.
HOSTDEVICE inline bool operator==(const char* a, const pstring& b) {
  if (strlen(a) != b.size()) {
    return false;
  }
  return PD_Memcmp(a, b.data(), b.size()) == 0;
}
HOSTDEVICE inline bool operator==(const std::string& a, const pstring& b) {
  if (a.size() != b.size()) {
    return false;
  }
  return PD_Memcmp(a.data(), b.data(), b.size()) == 0;
}
// Concatenation: reserves once, then appends both operands.
HOSTDEVICE inline pstring operator+(const pstring& a, const pstring& b) {
  pstring result;
  result.reserve(a.size() + b.size());
  result.append(a);
  result.append(b);
  return result;
}
// Streams the raw bytes (embedded NULs included).
HOSTDEVICE inline std::ostream& operator<<(std::ostream& o,
                                           const pstring& str) {
  o.write(str.data(), str.size());
  return o;
}
// Device-safe strlen for NUL-terminated strings (std::strlen is host-only).
HOSTDEVICE inline size_t strlen(const char* start) {
  size_t len = 0;
  while (start[len] != '\0') {
    ++len;
  }
  return len;
}
} // namespace dtype
} // namespace phi
......@@ -19,6 +19,7 @@ cc_library(lod_utils SRCS lod_utils.cc DEPS phi_enforce)
cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS fluid_convert_utils tensor_meta tensor_base)
cc_library(sparse_coo_tensor SRCS sparse_coo_tensor.cc DEPS tensor_meta tensor_base)
cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_base)
cc_library(string_tensor SRCS string_tensor.cc DEPS convert_utils tensor_meta tensor_base)
cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor)
cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor)
......
......@@ -16,6 +16,7 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/core/string_tensor.h"
namespace phi {
using DataType = paddle::experimental::DataType;
......@@ -167,6 +168,8 @@ struct DeviceContext::Impl {
static_cast<DenseTensor*>(tensor)->clear();
} else if (SelectedRows::classof(tensor)) {
static_cast<SelectedRows*>(tensor)->mutable_value()->clear();
} else if (StringTensor::classof(tensor)) {
static_cast<StringTensor*>(tensor)->clear();
} else {
PADDLE_THROW(errors::Unimplemented(
"Only support DenseTensor and SelectedRows now."));
......@@ -262,6 +265,7 @@ DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::bfloat16)
DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::float16)
DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex64)
DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128)
DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::pstring)
#undef DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION
......
......@@ -211,6 +211,14 @@ struct KernelRegistrar {
dtype == static_cast<size_t>(DataType::UINT16)) {
continue;
}
// NOTE(zhoushunjie): Only the strings kernels can support pstring dtype
constexpr char strings_kernels_prefix[] = "strings_";
if (dtype == static_cast<size_t>(DataType::PSTRING) &&
strncmp(kernel_name_cstr,
strings_kernels_prefix,
strlen(strings_kernels_prefix))) {
continue;
}
ConstructKernel(reg_type,
kernel_name_cstr,
backend_cstr,
......
......@@ -26,6 +26,7 @@
#include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/string_tensor.h"
#include "paddle/phi/core/type_defs.h"
namespace phi {
......@@ -232,6 +233,10 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCsrTensor);
PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SparseCsrTensor);
PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(StringTensor);
PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(StringTensor);
PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(StringTensor);
/* Attribute Helpers */
PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(bool);
......@@ -266,6 +271,8 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCsrTensor);
PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCsrTensor);
PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(StringTensor);
PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(StringTensor);
/* End case */
template <typename T>
struct KernelCallHelper<TypeTag<T>> {
......
......@@ -17,6 +17,8 @@ limitations under the License. */
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/core/string_tensor.h"
#include "paddle/phi/core/string_tensor_utils.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......@@ -33,6 +35,9 @@ void MetaTensor::set_dims(const DDim& dims) {
if (phi::DenseTensor::classof(tensor_)) {
DenseTensorUtils::GetMutableMeta(static_cast<DenseTensor*>(tensor_))->dims =
dims;
} else if (phi::StringTensor::classof(tensor_)) {
StringTensorUtils::GetMutableMeta(static_cast<StringTensor*>(tensor_))
->dims = dims;
} else if (phi::SelectedRows::classof(tensor_)) {
DenseTensorUtils::GetMutableMeta(
static_cast<SelectedRows*>(tensor_)->mutable_value())
......@@ -47,6 +52,8 @@ void MetaTensor::set_dtype(DataType dtype) {
if (phi::DenseTensor::classof(tensor_)) {
DenseTensorUtils::GetMutableMeta(static_cast<DenseTensor*>(tensor_))
->dtype = dtype;
} else if (phi::StringTensor::classof(tensor_)) {
// No need to set dtype
} else if (phi::SelectedRows::classof(tensor_)) {
DenseTensorUtils::GetMutableMeta(
static_cast<SelectedRows*>(tensor_)->mutable_value())
......@@ -61,6 +68,8 @@ void MetaTensor::set_layout(DataLayout layout) {
if (phi::DenseTensor::classof(tensor_)) {
DenseTensorUtils::GetMutableMeta(static_cast<DenseTensor*>(tensor_))
->layout = layout;
} else if (phi::StringTensor::classof(tensor_)) {
// No need to set layout
} else if (phi::SelectedRows::classof(tensor_)) {
DenseTensorUtils::GetMutableMeta(
static_cast<SelectedRows*>(tensor_)->mutable_value())
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/string_tensor.h"
namespace phi {
// Default ctor: no storage is allocated; only the offset is made
// well-defined.
StringTensor::StringTensor() { meta_.offset = 0; }
// Allocates numel() pstring slots through `a` and zero-initializes them via
// init_holder() so every slot starts as a valid pstring.
StringTensor::StringTensor(Allocator* a, const StringTensorMeta& meta)
    : meta_(meta), holder_(a->Allocate(SizeOf(dtype()) * numel())) {
  init_holder();
}
StringTensor::StringTensor(Allocator* a, StringTensorMeta&& meta)
    : meta_(std::move(meta)), holder_(a->Allocate(SizeOf(dtype()) * numel())) {
  init_holder();
}
// Wraps existing storage; the buffer is assumed to already hold valid
// pstrings, so init_holder() is deliberately not called here.
StringTensor::StringTensor(const std::shared_ptr<phi::Allocation>& holder,
                           const StringTensorMeta& meta)
    : meta_(meta), holder_(holder) {}
// Shallow copy: shares the underlying allocation with `other`.
StringTensor::StringTensor(const StringTensor& other) : meta_(other.meta()) {
  holder_ = other.holder_;
}
// Shallow copy-assignment: afterwards *this shares storage with `src`.
StringTensor& StringTensor::operator=(const StringTensor& src) {
  meta_ = src.meta();
  holder_ = src.holder_;
  return *this;
}
// Move-assignment: takes over src's meta and swaps holders, so the previous
// storage of *this ends up in `src`.
StringTensor& StringTensor::operator=(StringTensor&& src) {
  meta_ = std::move(src.meta_);
  std::swap(holder_, src.holder_);
  return *this;
}
// Number of pstring elements: a scalar tensor has exactly one element,
// otherwise the product of the dims.
int64_t StringTensor::numel() const {
  return meta_.is_scalar ? 1 : product(meta_.dims);
}

// Two StringTensors are "shared" when both point at the same non-null
// allocation.
bool StringTensor::IsSharedWith(const StringTensor& b) const {
  return holder_ != nullptr && holder_ == b.holder_;
}
// Returns where the underlying allocation lives (CPU/GPU). Requires the
// tensor to already have storage; calling this on an uninitialized tensor
// throws PreconditionNotMet.
const Place& StringTensor::place() const {
  PADDLE_ENFORCE_NOT_NULL(
      holder_,
      phi::errors::PreconditionNotMet(
          "Tensor not initialized yet when StringTensor::place() is called."));
  return holder_->place();
}
// Read-only access to the pstring payload, honoring the meta offset.
const dtype::pstring* StringTensor::data() const {
  PADDLE_ENFORCE_NOT_NULL(
      holder_,
      phi::errors::PreconditionNotMet(
          "The storage must be valid when call the mutable data function."));
  return reinterpret_cast<const dtype::pstring*>(
      reinterpret_cast<uintptr_t>(holder_->ptr()) + meta_.offset);
}
// Mutable access to the pstring payload. Storage must already exist (see
// AllocateFrom); the body is intentionally identical to the const overload
// apart from constness.
dtype::pstring* StringTensor::data() {
  PADDLE_ENFORCE_NOT_NULL(
      holder_,
      phi::errors::PreconditionNotMet(
          "The storage must be valid when call the mutable data function."));
  return reinterpret_cast<dtype::pstring*>(
      reinterpret_cast<uintptr_t>(holder_->ptr()) + meta_.offset);
}
// Replaces the metadata after validating it, so an invalid meta cannot
// silently corrupt the tensor description.
void StringTensor::set_meta(const StringTensorMeta& meta) {
  PADDLE_ENFORCE(
      meta.valid(),
      phi::errors::InvalidArgument(
          "Input meta is invalid, please check the meta attribute."));
  meta_.dims = meta.dims;
  meta_.is_scalar = meta.is_scalar;
  meta_.offset = meta.offset;
}
// Updates the shape only; storage is NOT (re)allocated here — callers are
// expected to follow up with AllocateFrom / Alloc when needed.
StringTensor& StringTensor::Resize(const DDim& dims) {
  meta_.dims = dims;
  return *this;
}
// TODO(zhoushunjie): need to remove it for general space
// Zero-fills the freshly allocated buffer. Presumably an all-zero PD_PString
// is a valid empty string, so this leaves every pstring slot in a usable
// state — confirm against cpstring_impl.h.
void StringTensor::init_holder() {
  void* ptr = holder_->ptr();
  auto& place = holder_->place();
  auto bytes_size = holder_->size();
  VLOG(6) << "Init StringTensor data with bytes:" << bytes_size;
  if (place.GetType() == phi::AllocationType::CPU) {
    std::memset(ptr, 0, bytes_size);
  } else if (place.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
    hipMemset(ptr, 0, bytes_size);
#else
    cudaMemset(ptr, 0, bytes_size);
#endif
#endif
  } else {
    // TODO(zhoushunjie): Need to support more places
    PADDLE_THROW(
        errors::Unimplemented("StringTensor can only be created in CPU or GPU "
                              "place. But now attempts to "
                              "create StringTensor on %s",
                              place.DebugString()));
  }
}
// Allocates (or reuses) storage large enough for numel() pstrings and
// returns a pointer to the payload.
//
// `dtype` is part of the TensorBase allocation interface but is ignored
// here: a StringTensor always stores DataType::PSTRING.
// `requested_size`, when non-zero, must be at least the size implied by the
// metadata and becomes the allocation size.
void* StringTensor::AllocateFrom(Allocator* allocator,
                                 DataType dtype,
                                 size_t requested_size) {
  PADDLE_ENFORCE_NOT_NULL(
      allocator,
      errors::InvalidArgument(
          "Required allocator shall not be nullptr, but received nullptr."));
  PADDLE_ENFORCE(
      valid(),
      errors::PreconditionNotMet(
          "The meta data must be valid when call the mutable data function."));
  size_t bytes = numel() * SizeOf(this->dtype());
  if (requested_size) {
    PADDLE_ENFORCE_GE(requested_size,
                      bytes,
                      errors::InvalidArgument(
                          "The reserved size %d should be enough to meet the "
                          "volume required by metadata %d.",
                          requested_size,
                          bytes));
    bytes = requested_size;
  }
  if (!holder_ || holder_->size() < bytes + meta_.offset) {
    // Existing buffer (if any) is too small: drop it before requesting a new
    // one so the old allocation is released first.
    meta_.offset = 0;
    VLOG(10) << "Allocate string data with bytes: " << bytes;
    holder_.reset();
    holder_ = allocator->Allocate(bytes);
    // Initialize the allocated bytes so every slot is a valid empty pstring.
    init_holder();
  }
  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                 meta_.offset);
}
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/common/pstring.h"
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/storage.h"
#include "paddle/phi/core/tensor_base.h"
#include "paddle/phi/core/tensor_meta.h"
namespace phi {
/// \brief In Paddle 2.3, we add a new type of Tensor, StringTensor,
/// which is designed for string data management.
/// During the entire life cycle of a StringTensor, its device type and key
/// metadata are set unchanged.
class StringTensorUtils;
class StringTensor : public TensorBase,
                     public TypeInfoTraits<TensorBase, StringTensor> {
 public:
  /// \brief Construct a string tensor and allocate space.
  /// \param a The allocator used to allocate space.
  /// \param meta The meta data of string tensor.
  StringTensor(Allocator* a, const StringTensorMeta& meta);
  /// \brief Construct a string tensor and allocate space.
  /// \param a The allocator used to allocate space.
  /// \param meta The meta data of string tensor.
  StringTensor(Allocator* a, StringTensorMeta&& meta);
  /// \brief Wrap pre-existing storage; the buffer is not re-initialized.
  StringTensor(const std::shared_ptr<phi::Allocation>& holder,
               const StringTensorMeta& meta);
  /// \brief Because string tensor is a resource handle, we provide a default
  /// move constructor to support move semantics.
  StringTensor(StringTensor&& other) = default;
  /// \brief Shallow copy: the new tensor shares the allocation.
  StringTensor(const StringTensor& other);
  /// \brief Default ctor: no storage is allocated.
  StringTensor();
  /// \brief StringTensor shallow copy assignment.
  StringTensor& operator=(const StringTensor& other);
  StringTensor& operator=(StringTensor&& other);
  /// \brief Destroy the tensor object and release exclusive resources.
  virtual ~StringTensor() = default;

 public:
  /// \brief Returns the name of the class for type traits.
  /// \return The name of the class.
  static const char* name() { return "StringTensor"; }
  /// \brief Returns the number of elements contained in tensor.
  /// \return The number of elements contained in tensor.
  int64_t numel() const override;
  /// \brief Returns the dims of the tensor.
  /// \return The dims of the tensor.
  const DDim& dims() const noexcept override { return meta_.dims; }
  /// \brief Returns the data place of the tensor.
  /// \return The data place of the tensor.
  const Place& place() const override;
  /// \brief Returns the meta information of the tensor.
  /// \return The meta information of the tensor.
  const StringTensorMeta& meta() const noexcept { return meta_; }
  /// \brief Returns the data type of the tensor.
  /// \return The data type of the tensor. Always PSTRING for StringTensor.
  DataType dtype() const noexcept override { return DataType::PSTRING; }
  /// \brief Returns the data layout of the tensor.
  /// \return The data layout of the tensor. Always PSTRING_UNION.
  DataLayout layout() const noexcept override {
    return DataLayout::PSTRING_UNION;
  }
  /// \brief Replace the metadata (validated in the implementation).
  void set_meta(const StringTensorMeta& meta);
  /// \brief Test whether the metadata is valid.
  /// \return Whether the metadata is valid.
  bool valid() const noexcept override { return meta_.valid(); }
  /// \brief Test whether the storage is allocated.
  /// return Whether the storage is allocated.
  bool initialized() const override { return holder_ && holder_->ptr(); }
  /// \brief Check if storage is shared with other objects.
  /// \return Whether the storage is shared with other objects.
  bool IsSharedWith(const StringTensor& b) const;
  /// \brief Change the shape only; does not (re)allocate storage.
  StringTensor& Resize(const DDim& dims);
  /// \brief Returns the actual storage size occupied by tensor, may be larger
  /// than its shape dims.
  /// \return The actual storage size occupied by tensor.
  /// NOTE(review): dereferences holder_ without a null check — calling this
  /// on an unallocated tensor is invalid; confirm callers guarantee storage.
  size_t capacity() const { return holder_->size(); }
  /// \brief Get the const data pointer value of pstring type.
  /// \return The const data pointer value of pstring type.
  const dtype::pstring* data() const;
  /// \brief Mutable pointer to the pstring payload (storage must exist).
  dtype::pstring* data();
  /// \brief Drop the storage and reset the offset; dims are kept.
  void clear() {
    holder_.reset();
    meta_.offset = 0;
  }
  /// \brief TensorBase allocation hook; `dtype` is ignored (always PSTRING).
  void* AllocateFrom(Allocator* allocator,
                     DataType dtype,
                     size_t requested_size = 0);

 private:
  friend class StringTensorUtils;

 private:
  StringTensorMeta meta_;
  std::shared_ptr<phi::Allocation> holder_;
  // Zero-fills newly allocated storage so every slot is a valid pstring.
  void init_holder();
};
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/core/string_tensor.h"
#include "paddle/phi/core/tensor_meta.h"
namespace phi {
// Grants framework-internal access to a StringTensor's mutable metadata.
// Kept in a friend utility class so ordinary callers go through
// StringTensor::set_meta (which validates the meta) instead.
class StringTensorUtils {
 public:
  static StringTensorMeta* GetMutableMeta(StringTensor* tensor) {
    return &(tensor->meta_);
  }
};
} // namespace phi
......@@ -40,4 +40,12 @@ bool DenseTensorMeta::valid() const noexcept {
return valid;
}
StringTensorMeta::StringTensorMeta(const DDim& dims) : dims(dims) {}

// The meta is valid when it describes a scalar or its dims yield a
// non-negative element count.
bool StringTensorMeta::valid() const noexcept {
  return is_scalar || product(dims) >= 0;
}
} // namespace phi
......@@ -64,4 +64,24 @@ inline bool operator==(const DenseTensorMeta& lhs, const DenseTensorMeta& rhs) {
(lhs.lod == rhs.lod) && (lhs.offset == rhs.offset);
}
// Metadata of a StringTensor: shape, scalar flag and byte offset into the
// underlying allocation. Note there is no dtype/layout field — both are
// fixed (PSTRING / PSTRING_UNION) for string tensors.
struct StringTensorMeta {
  StringTensorMeta() = default;
  explicit StringTensorMeta(const DDim& dims);
  /// \brief Test whether the metadata is valid. Does not throw exceptions.
  /// \return Whether the metadata is valid.
  bool valid() const noexcept;

  /// During the entire life cycle of a StringTensor, the following attributes
  /// marked with `const` are expected to remain unchanged.
  /// NOTE(review): none of the members below is actually declared const; the
  /// sentence above mirrors DenseTensorMeta's wording — confirm the intent.
  bool is_scalar{false};
  DDim dims;
  size_t offset{0};
};

// Two metas are equal when all three attributes match.
inline bool operator==(const StringTensorMeta& lhs,
                       const StringTensorMeta& rhs) {
  return (lhs.is_scalar == rhs.is_scalar) && (lhs.dims == rhs.dims) &&
         (lhs.offset == rhs.offset);
}
} // namespace phi
cc_library(infermeta SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc DEPS convert_utils meta_tensor infermeta_utils)
cc_library(backward_infermeta SRCS backward.cc DEPS meta_tensor convert_utils)
add_subdirectory(strings)
cc_library(string_infermeta SRCS nullary.cc unary.cc DEPS convert_utils infermeta_utils)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/infermeta/strings/nullary.h"
namespace phi {
namespace strings {
// Infers the output meta of string-creation ops: the shape comes from the
// `shape` attribute, while dtype/layout are the fixed StringTensor values.
void CreateInferMeta(const ScalarArray& shape, MetaTensor* out) {
  out->set_dims(phi::make_ddim(shape.GetData()));
  out->set_dtype(DataType::PSTRING);
  out->set_layout(DataLayout::PSTRING_UNION);
}
} // namespace strings
} // namespace phi
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/tensor_meta.h"
namespace phi {
namespace strings {
void CreateInferMeta(const std::vector<int64_t>& shape, MetaTensor* out);
void CreateInferMeta(const ScalarArray& shape, MetaTensor* out);
} // namespace strings
} // namespace phi
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/infermeta/strings/unary.h"
#include "paddle/phi/core/infermeta_utils.h"
namespace phi {
namespace strings {
// Shape-preserving infer-meta for string ops: the output takes the input's
// dims plus the fixed dtype/layout every StringTensor uses.
void UnchangedInferMeta(const StringTensorMeta& x_meta, MetaTensor* out) {
  out->set_dims(x_meta.dims);
  out->set_dtype(DataType::PSTRING);
  out->set_layout(DataLayout::PSTRING_UNION);
}
// "Like" infer-meta: mirrors dims/dtype/layout of `x` verbatim (for a string
// input these are expected to be PSTRING / PSTRING_UNION already).
void CreateLikeInferMeta(const MetaTensor& x, MetaTensor* out) {
  out->set_dims(x.dims());
  out->set_dtype(x.dtype());
  out->set_layout(x.layout());
}
} // namespace strings
} // namespace phi
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
// See Note [ Why still include the fluid headers? ]
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/tensor_meta.h"
namespace phi {
namespace strings {
// Common InferMeta Functions of StringTensor for unary operators:
void UnchangedInferMeta(const StringTensorMeta& x_meta, MetaTensor* out);
void CreateLikeInferMeta(const MetaTensor& x, MetaTensor* out);
} // namespace strings
} // namespace phi
......@@ -68,6 +68,8 @@ add_subdirectory(sparse)
add_subdirectory(selected_rows)
copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
# For strings kernels
add_subdirectory(strings)
# 5. kernel autotune
add_subdirectory(autotune)
add_subdirectory(cpu)
if(WITH_GPU OR WITH_ROCM)
add_subdirectory(gpu)
endif()
cc_library(unicode SRCS unicode.cc DEPS utf8proc)
set_property(GLOBAL PROPERTY STRING_KERNELS "")
set(STRING_KERNEL_DEPS dense_tensor string_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel)
set(STRING_KERNEL_DEPS ${STRING_KERNEL_DEPS} eigen_function blas math_function)
# remove this dep after removing fluid deps on tensor creation
set(STRING_KERNEL_DEPS ${STRING_KERNEL_DEPS} phi_api_utils)
set(STRING_KERNEL_DEPS ${STRING_KERNEL_DEPS} string_infermeta)
set(STRING_KERNEL_DEPS ${STRING_KERNEL_DEPS} unicode)
register_kernels(DEPS ${STRING_KERNEL_DEPS} SUB_DIR "strings")
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/phi/common/pstring.h"
#include "paddle/phi/kernels/strings/unicode.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include "paddle/phi/backends/gpu/gpu_context.h"
#endif
namespace phi {
namespace strings {
using pstring = dtype::pstring;
// Maps 'A'..'Z' to 'a'..'z'; every other byte passes through unchanged.
struct AsciiToLower {
  HOSTDEVICE char operator()(char in) const {
    if (in < 'A' || in > 'Z') {
      return in;
    }
    return static_cast<char>(in + ('a' - 'A'));
  }
};
// Maps 'a'..'z' to 'A'..'Z'; every other byte passes through unchanged.
struct AsciiToUpper {
  HOSTDEVICE char operator()(char in) const {
    if ('a' <= in && in <= 'z') {
      return static_cast<char>(in - ('a' - 'A'));
    }
    return in;
  }
};
// Lower-cases a single Unicode code point using precomputed tables:
// `unicode_flag_map` classifies code points (BMP only), `cases_map` holds
// the case-converted counterpart.
template <typename Context>
struct UTF8ToLower {
  HOSTDEVICE UTF8ToLower(const uint8_t* unicode_flag_map,
                         const uint16_t* cases_map)
      : unicode_flag_map_(unicode_flag_map), cases_map_(cases_map) {}

  HOSTDEVICE uint32_t operator()(uint32_t in) const {
    // Code points beyond the BMP have no flag entry and get flag 0.
    const uint32_t flag = (in <= 0x00FFFF) ? unicode_flag_map_[in] : 0;
    if (strings::IsUpper(flag)) {
      return cases_map_[in];
    }
    return in;
  }

  const uint8_t* unicode_flag_map_;
  const uint16_t* cases_map_;
};
// Converts a single unicode code point to upper case using the pre-built
// flag and case tables. Code points outside the Basic Multilingual Plane
// (> 0xFFFF) are returned unchanged since the tables only cover 16 bits.
template <typename Context>
struct UTF8ToUpper {
  HOSTDEVICE UTF8ToUpper(const uint8_t* unicode_flag_map,
                         const uint16_t* cases_map)
      : unicode_flag_map_(unicode_flag_map), cases_map_(cases_map) {}

  HOSTDEVICE uint32_t operator()(uint32_t code_point) const {
    if (code_point > 0x00FFFF) {
      return code_point;
    }
    const uint32_t flag = unicode_flag_map_[code_point];
    return strings::IsLower(flag) ? cases_map_[code_point] : code_point;
  }

  const uint8_t* unicode_flag_map_;
  const uint16_t* cases_map_;
};
} // namespace strings
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/strings/strings_copy_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
namespace strings {
template <typename Context>
void Copy(const Context& dev_ctx,
const StringTensor& src,
bool blocking,
StringTensor* dst) {
auto* src_ptr = src.data();
const auto& src_place = src.place();
VLOG(3) << "StringTensorCopy " << src.dims() << " from " << src.place()
<< " to " << src_place;
dst->Resize(src.dims());
dtype::pstring* dst_ptr = dev_ctx.template Alloc<dtype::pstring>(dst);
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same string data async from " << src_place
<< " to " << src_place;
return;
}
VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
int64_t numel = src.numel();
if (src_place.GetType() == phi::AllocationType::CPU) {
for (int64_t i = 0; i < numel; ++i) {
dst_ptr[i] = src_ptr[i];
}
}
}
} // namespace strings
} // namespace phi
PD_REGISTER_GENERAL_KERNEL(strings_copy,
CPU,
ALL_LAYOUT,
phi::strings::Copy<phi::CPUContext>,
pstring) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/strings/strings_lower_upper_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/pstring.h"
#include "paddle/phi/core/kernel_registry.h"
using pstring = ::phi::dtype::pstring;
namespace phi {
namespace strings {
template <typename ContextT>
void StringLowerKernel(const ContextT& dev_ctx,
const StringTensor& x,
bool use_utf8_encoding,
StringTensor* out) {
StringCaseConvertKernel<AsciiCaseConverter<ContextT, AsciiToLower>,
UTF8CaseConverter<ContextT, UTF8ToLower>,
ContextT>()(dev_ctx, x, use_utf8_encoding, out);
}
template <typename ContextT>
void StringUpperKernel(const ContextT& dev_ctx,
const StringTensor& x,
bool use_utf8_encoding,
StringTensor* out) {
StringCaseConvertKernel<AsciiCaseConverter<ContextT, AsciiToUpper>,
UTF8CaseConverter<ContextT, UTF8ToUpper>,
ContextT>()(dev_ctx, x, use_utf8_encoding, out);
}
} // namespace strings
} // namespace phi
PD_REGISTER_GENERAL_KERNEL(strings_lower,
CPU,
ALL_LAYOUT,
phi::strings::StringLowerKernel<phi::CPUContext>,
pstring) {}
PD_REGISTER_GENERAL_KERNEL(strings_upper,
CPU,
ALL_LAYOUT,
phi::strings::StringUpperKernel<phi::CPUContext>,
pstring) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/backends/gpu/gpu_helper.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/pstring.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/string_tensor.h"
namespace phi {
namespace strings {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Packs all pstrings of a StringTensor into one flat byte buffer.
// Layout: strings_offset[0..numel] (int32 offsets) followed by the
// NUL-terminated string payloads at their recorded offsets.
__global__ void SerializeStringsData(const phi::dtype::pstring* src_str,
uint8_t* strings_data,
int32_t* strings_offset,
int64_t numel,
int32_t start_offset) {
// A single thread builds the offset table; the +1 keeps each string's
// trailing '\0' in the payload.
if (threadIdx.x == 0 && blockIdx.x == 0) {
strings_offset[0] = start_offset;
for (int64_t i = 1; i <= numel; ++i) {
strings_offset[i] = strings_offset[i - 1] + src_str[i - 1].length() + 1;
}
}
// NOTE(review): __syncthreads() only synchronizes threads within one
// block, but strings_offset is written by thread (0,0) and read by all
// blocks below -- verify the launch guarantees visibility across blocks
// or that a grid-wide synchronization is not required here.
__syncthreads();
CUDA_KERNEL_LOOP(i, numel) {
memcpy(strings_data + strings_offset[i],
src_str[i].data(),
src_str[i].length() + 1);
}
}
// Accumulates into *num the total number of bytes (including one '\0'
// per string) needed to store all pstrings of the tensor.
__global__ void SumStringsLen(const phi::dtype::pstring* src_ptr,
int64_t numel,
int* num) {
// Dynamic shared memory: one partial counter per thread of the block.
extern __shared__ int counter[];
int thread_counter = 0;
CUDA_KERNEL_LOOP(i, numel) { thread_counter += src_ptr[i].length() + 1; }
counter[threadIdx.x] = thread_counter;
__syncthreads();
// Thread 0 reduces the block's counters and publishes the block total.
if (threadIdx.x == 0) {
int block_counter = 0;
for (int i = 0; i < blockDim.x; ++i) {
block_counter += counter[i];
}
atomicAdd(num, block_counter);
}
}
// Returns the total byte size (each string's length plus its '\0') of
// all pstrings in `src_ptr`, computed on the device via SumStringsLen.
template <typename Context>
int GetAllStringsSize(const Context& dev_ctx,
                      const phi::dtype::pstring* src_ptr,
                      size_t numel) {
  // Device-side scratch int that accumulates the total byte count.
  auto nums_meta =
      phi::DenseTensorMeta(DataType::INT32, {1}, phi::DataLayout::NCHW);
  DenseTensor nums_tensor = phi::Empty(dev_ctx, std::move(nums_meta));
  int* nums_ptr = dev_ctx.template Alloc<int>(&nums_tensor);
  phi::backends::gpu::GpuMemsetAsync(
      nums_ptr, 0, sizeof(int), dev_ctx.stream());
  dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
  dim3 grid_size =
      dim3((numel + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
  // One int of dynamic shared memory per thread for the block reduction.
  SumStringsLen<<<grid_size,
                  block_size,
                  PREDEFINED_BLOCK_SIZE * sizeof(int),
                  dev_ctx.stream()>>>(src_ptr, numel, nums_ptr);
  int num = -1;
#ifdef PADDLE_WITH_HIP
  phi::backends::gpu::GpuMemcpyAsync(
      &num, nums_ptr, sizeof(int), hipMemcpyDeviceToHost, dev_ctx.stream());
#else
  phi::backends::gpu::GpuMemcpyAsync(
      &num, nums_ptr, sizeof(int), cudaMemcpyDeviceToHost, dev_ctx.stream());
#endif
  // Fix: the device-to-host copy above is asynchronous; drain the stream
  // before reading `num`, otherwise the host may observe a stale value.
  dev_ctx.Wait();
  return num;
}
// Inverse of SerializeStringsData: rebuilds one pstring per element from
// the flat buffer using the int32 offset table stored at its head.
__global__ void DeserializeCUDAKernel(const char* strings_data,
const int* strings_offset,
phi::dtype::pstring* dst_str,
int numel) {
CUDA_KERNEL_LOOP(i, numel) {
// -1 not include '\0'
auto len = strings_offset[i + 1] - strings_offset[i] - 1;
dst_str[i] = phi::dtype::pstring(strings_data + strings_offset[i], len);
}
}
#endif
// Host-side serialization of a StringTensor into a flat DenseTensor.
// Buffer layout matches SerializeStringsData: an int32 offset table with
// numel + 1 entries followed by the NUL-terminated string bytes.
template <typename Context>
void SerializeOnCPU(const Context& dev_ctx,
                    const StringTensor& src,
                    DenseTensor* dst) {
  const int64_t numel = src.numel();
  const auto* src_str = src.data();
  // Total size = offset table + every string payload (incl. its '\0').
  int64_t total_bytes = sizeof(int) * (numel + 1);
  for (int64_t i = 0; i < numel; ++i) {
    total_bytes += src_str[i].length() + 1;
  }
  dst->Resize(phi::make_ddim({total_bytes}));
  uint8_t* strings_data = dev_ctx.template HostAlloc<uint8_t>(dst);
  auto* strings_offset = reinterpret_cast<int*>(strings_data);
  // Prefix sums: offset[0] points just past the table itself.
  strings_offset[0] = sizeof(int) * (numel + 1);
  for (int64_t i = 1; i <= numel; ++i) {
    strings_offset[i] = strings_offset[i - 1] + src_str[i - 1].length() + 1;
  }
  for (int64_t i = 0; i < numel; ++i) {
    memcpy(strings_data + strings_offset[i],
           src_str[i].data(),
           src_str[i].length() + 1);
  }
}
// Host-side inverse of SerializeOnCPU. The first offset entry equals the
// size of the offset table, from which the element count is recovered.
template <typename Context>
void DeserializeOnCPU(const Context& dev_ctx,
                      const DenseTensor& src,
                      StringTensor* dst) {
  const auto* bytes = reinterpret_cast<const char*>(src.data<uint8_t>());
  const auto* offsets = reinterpret_cast<const int*>(bytes);
  const int numel = offsets[0] / sizeof(int) - 1;
  dst->Resize(phi::make_ddim({numel}));
  dtype::pstring* dst_str = dev_ctx.template HostAlloc<dtype::pstring>(dst);
  for (int i = 0; i < numel; ++i) {
    // Subtract 1 so the trailing '\0' is excluded from the length.
    const auto len = offsets[i + 1] - offsets[i] - 1;
    dst_str[i] = phi::dtype::pstring(bytes + offsets[i], len);
  }
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Serializes a GPU StringTensor into a flat GPU DenseTensor with the
// [int32 offset table | NUL-terminated payloads] layout.
void SerializeOnGPU(const phi::GPUContext& dev_ctx,
const StringTensor& src,
DenseTensor* dst) {
int64_t numel = src.numel();
auto* src_str = src.data();
// 1.get the number of bytes of all strings in string tensor
auto strings_size = GetAllStringsSize(dev_ctx, src_str, numel);
// Add room for the offset table that precedes the payload.
strings_size += sizeof(int32_t) * (numel + 1);
dst->Resize(phi::make_ddim({strings_size}));
uint8_t* strings_data = dev_ctx.template Alloc<uint8_t>(dst);
auto* strings_offset = reinterpret_cast<int*>(strings_data);
int32_t start_offset = sizeof(int32_t) * (numel + 1);
// 2. serialize strings data to dense tensor
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
dim3 grid_size =
dim3((numel + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
SerializeStringsData<<<grid_size, block_size, 0, dev_ctx.stream()>>>(
src_str, strings_data, strings_offset, numel, start_offset);
}
// Deserializes a flat GPU DenseTensor (as produced by SerializeOnGPU or
// uploaded from SerializeOnCPU) back into a GPU StringTensor.
void DeserializeOnGPU(const phi::GPUContext& dev_ctx,
const DenseTensor& src,
StringTensor* dst) {
auto* strings_data = reinterpret_cast<const char*>(src.data<uint8_t>());
auto* strings_offset = reinterpret_cast<const int*>(strings_data);
int numel = 0;
// The first offset equals the table size, so it encodes numel; fetch it
// to the host synchronously before sizing the output tensor.
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpySync(
&numel, strings_data, sizeof(numel), hipMemcpyDeviceToHost);
#else
phi::backends::gpu::GpuMemcpySync(
&numel, strings_data, sizeof(numel), cudaMemcpyDeviceToHost);
#endif
numel = numel / sizeof(int) - 1;
dst->Resize(phi::make_ddim({numel}));
dtype::pstring* dst_str = dev_ctx.template Alloc<dtype::pstring>(dst);
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
dim3 grid_size =
dim3((numel + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
DeserializeCUDAKernel<<<grid_size, block_size, 0, dev_ctx.stream()>>>(
strings_data, strings_offset, dst_str, numel);
}
#endif
} // namespace strings
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/strings/strings_copy_kernel.h"
#include "paddle/phi/backends/gpu/gpu_helper.h"
#include "paddle/phi/common/pstring.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/strings/gpu/copy_utils.h"
using pstring = ::phi::dtype::pstring;
namespace phi {
namespace strings {
// Element-wise device-to-device copy of pstrings between two GPU
// StringTensors; each iteration assigns one pstring.
__global__ void CopyFromStringTensor(pstring* dst,
const pstring* src,
int64_t num) {
CUDA_KERNEL_LOOP(i, num) { dst[i] = src[i]; }
}
// GPU StringTensor copy, handled in three situations:
//   1. GPU->CPU: serialize on GPU, memcpy the flat buffer, deserialize on CPU
//   2. CPU->GPU: serialize on CPU, memcpy the flat buffer, deserialize on GPU
//   3. GPU->GPU: direct element-wise device copy
// A CPU->CPU pair is rejected: that case belongs to the CPU kernel.
// NOTE(review): the `blocking` argument is never consulted in this
// function -- confirm whether synchronous copies need dev_ctx.Wait().
template <typename Context>
void Copy(const Context& dev_ctx,
const StringTensor& src,
bool blocking,
StringTensor* dst) {
auto* src_ptr = src.data();
const auto& src_place = src.place();
auto dst_place = dst->place();
if (src_place == dst_place &&
src_place.GetType() == phi::AllocationType::CPU) {
PADDLE_THROW(
phi::errors::InvalidArgument("The src and dst string tensor are all "
"CPU string tensor, you should call copy "
"function in CPU mode."));
}
VLOG(3) << "StringTensorCopy " << src.dims() << " from " << src.place()
<< " to " << dst_place;
dst->Resize(src.dims());
auto* dst_ptr = dev_ctx.template Alloc<dtype::pstring>(dst);
// Self-copy: nothing to do once source and destination alias.
if (src_ptr == dst_ptr && src_place == dst_place) {
VLOG(3) << "Skip copy the same string data async from " << src_place
<< " to " << dst_place;
return;
}
VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
if (src_place.GetType() == phi::AllocationType::GPU &&
dst_place.GetType() == phi::AllocationType::CPU) {
// Situation 1: gpu_place->cpu_place
// The {1} shape is a placeholder; SerializeOnGPU resizes the tensor.
DenseTensor gpu_serialized = phi::Empty<uint8_t, GPUContext>(dev_ctx, {1});
phi::strings::SerializeOnGPU(dev_ctx, src, &gpu_serialized);
DenseTensor cpu_serialized;
cpu_serialized.Resize(gpu_serialized.dims());
dev_ctx.template HostAlloc<uint8_t>(&cpu_serialized);
phi::Copy(dev_ctx, gpu_serialized, dst_place, false, &cpu_serialized);
phi::strings::DeserializeOnCPU(dev_ctx, cpu_serialized, dst);
} else if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::GPU) {
// Situation 2: cpu_place->gpu_place
DenseTensor cpu_serialized;
cpu_serialized.Resize({1});
dev_ctx.template HostAlloc<uint8_t>(&cpu_serialized);
phi::strings::SerializeOnCPU(dev_ctx, src, &cpu_serialized);
DenseTensor gpu_serialized =
phi::EmptyLike<uint8_t>(dev_ctx, cpu_serialized);
phi::Copy(
dev_ctx, cpu_serialized, dev_ctx.GetPlace(), false, &gpu_serialized);
phi::strings::DeserializeOnGPU(dev_ctx, gpu_serialized, dst);
} else if (src_place.GetType() == phi::AllocationType::GPU &&
dst_place.GetType() == phi::AllocationType::GPU) {
// Situation 3: gpu_place->gpu_place
auto src_gpu_place = src_place;
auto dst_gpu_place = dst_place;
auto ctx_place = dev_ctx.GetPlace();
PADDLE_ENFORCE_EQ(
ctx_place.GetType(),
phi::AllocationType::GPU,
phi::errors::PreconditionNotMet(
"Context place error, excepted GPUPlace, but actually %s.",
ctx_place));
int64_t numel = src.numel();
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
dim3 grid_size =
dim3((numel + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
// Copy
CopyFromStringTensor<<<grid_size, block_size, 0, dev_ctx.stream()>>>(
dst_ptr, src_ptr, numel);
}
}
} // namespace strings
} // namespace phi
PD_REGISTER_GENERAL_KERNEL(strings_copy,
GPU,
ALL_LAYOUT,
phi::strings::Copy<phi::GPUContext>,
pstring) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/strings/strings_lower_upper_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/pstring.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/strings/unicode.h"
using pstring = ::phi::dtype::pstring;
namespace phi {
namespace strings {
// Per-string ASCII case conversion on the device: each iteration copies
// its input pstring into `out` and then rewrites the bytes in place.
// NOTE(review): thrust::transform is invoked from device code with the
// thrust::device policy -- confirm the intended per-thread (sequential)
// execution semantics on the supported CUDA/ROCm toolchains.
template <typename CharConverter>
__global__ void StringCaseConvertCUDAKernel(pstring* out,
const pstring* in,
size_t num) {
CUDA_KERNEL_LOOP(i, num) {
out[i] = pstring(in[i]);
thrust::transform(thrust::device,
in[i].begin(),
in[i].end(),
out[i].mdata(),
CharConverter());
}
}
// GPU specialization of the ASCII converter: launches one kernel that
// converts all strings on dev_ctx's stream.
template <typename CharConverter>
struct AsciiCaseConverter<phi::GPUContext, CharConverter> {
void operator()(const phi::GPUContext& dev_ctx,
const pstring* in,
pstring* out,
size_t num) const {
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
dim3 grid_size =
dim3((num + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
StringCaseConvertCUDAKernel<
CharConverter><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
out, in, num);
}
};
// GPU specialization of the UTF-8 converter. Pipeline per batch:
//   1. compute per-string unicode lengths,
//   2. decode UTF-8 bytes to code points and case-convert them,
//   3. compute per-string UTF-8 lengths of the converted code points,
//   4. re-encode to UTF-8 and assign the results into `out`.
// NOTE(review): unicode_offsets/utf8_offsets are filled with per-string
// *lengths* at index idx+1, yet are later used as start *offsets* -- no
// prefix scan is visible between the two uses; verify an inclusive_scan
// is not missing here.
template <template <typename DeviceContextT> typename CharConverter>
struct UTF8CaseConverter<phi::GPUContext, CharConverter> {
void operator()(const phi::GPUContext& dev_ctx,
const pstring* in,
pstring* out,
size_t num) const {
// Device-resident copies of the unicode flag and case tables.
auto unicode_flag_map = GetGPUUniflagMap();
auto cases_map = GetGPUCharcasesMap();
thrust::device_vector<uint32_t> unicode_offsets(num + 1, 0);
uint32_t* unicode_offsets_ptr =
thrust::raw_pointer_cast(unicode_offsets.data());
thrust::for_each_n(thrust::device,
thrust::make_counting_iterator<unsigned int>(0),
num,
[unicode_offsets_ptr, in] __device__(uint32_t idx) {
unicode_offsets_ptr[idx + 1] =
GetUnicodeStrLen(in[idx].data(), in[idx].size());
});
uint32_t total_lengths = thrust::reduce(
thrust::device, unicode_offsets_ptr, unicode_offsets_ptr + num + 1, 0);
// Nothing to convert when every input string is empty.
if (total_lengths == 0) {
return;
}
thrust::device_vector<uint32_t> unicode_output(total_lengths, 0);
uint32_t* unicode_output_ptr =
thrust::raw_pointer_cast(unicode_output.data());
CharConverter<GPUContext> converter(unicode_flag_map, cases_map);
thrust::for_each_n(
thrust::device,
thrust::make_counting_iterator<unsigned int>(0),
num,
[in,
out,
unicode_output_ptr,
unicode_offsets_ptr,
converter] __device__(uint32_t idx) {
uint32_t unicode_len =
unicode_offsets_ptr[idx + 1] - unicode_offsets_ptr[idx];
GetUnicodeStr(in[idx].data(),
unicode_output_ptr + unicode_offsets_ptr[idx],
unicode_len);
uint32_t* curr_unicode_output_ptr =
unicode_output_ptr + unicode_offsets_ptr[idx];
for (uint32_t i = 0; i < unicode_len; ++i) {
curr_unicode_output_ptr[i] = converter(curr_unicode_output_ptr[i]);
}
// NOTE(review): the transform below applies `converter` a second
// time over the same range the loop above already converted. Case
// conversion is idempotent so the result is unchanged, but one of
// the two passes looks redundant -- confirm and drop one.
thrust::transform(thrust::device,
unicode_output_ptr + unicode_offsets_ptr[idx],
unicode_output_ptr + unicode_offsets_ptr[idx + 1],
unicode_output_ptr + unicode_offsets_ptr[idx],
converter);
});
thrust::device_vector<uint32_t> utf8_offsets(num + 1, 0);
uint32_t* utf8_offsets_ptr = thrust::raw_pointer_cast(utf8_offsets.data());
thrust::for_each_n(
thrust::device,
thrust::make_counting_iterator<unsigned int>(0),
num,
[utf8_offsets_ptr, unicode_output_ptr, unicode_offsets_ptr] __device__(
uint32_t idx) {
uint32_t unicode_len =
unicode_offsets_ptr[idx + 1] - unicode_offsets_ptr[idx];
utf8_offsets_ptr[idx + 1] = GetUTF8StrLen(
unicode_output_ptr + unicode_offsets_ptr[idx], unicode_len);
});
uint32_t total_utf8_lengths = thrust::reduce(
thrust::device, utf8_offsets_ptr, utf8_offsets_ptr + num + 1, 0);
thrust::device_vector<char> utf8_output(total_utf8_lengths, 0);
char* utf8_output_ptr = thrust::raw_pointer_cast(utf8_output.data());
thrust::for_each_n(thrust::device,
thrust::make_counting_iterator<unsigned int>(0),
num,
[utf8_output_ptr,
utf8_offsets_ptr,
unicode_output_ptr,
unicode_offsets_ptr,
out] __device__(uint32_t idx) {
uint32_t unicode_len = unicode_offsets_ptr[idx + 1] -
unicode_offsets_ptr[idx];
const uint32_t* input_ptr =
unicode_output_ptr + unicode_offsets_ptr[idx];
char* result_ptr =
utf8_output_ptr + utf8_offsets_ptr[idx];
GetUTF8Str(input_ptr, result_ptr, unicode_len);
// Assigning from char* -- presumably relies on the
// buffer being NUL-terminated; verify GetUTF8Str
// writes the trailing '\0'.
out[idx] = result_ptr;
});
}
};
template <typename ContextT>
void StringLowerKernel(const ContextT& dev_ctx,
const StringTensor& x,
bool use_utf8_encoding,
StringTensor* out) {
StringCaseConvertKernel<AsciiCaseConverter<ContextT, AsciiToLower>,
UTF8CaseConverter<ContextT, UTF8ToLower>,
ContextT>()(dev_ctx, x, use_utf8_encoding, out);
}
template <typename ContextT>
void StringUpperKernel(const ContextT& dev_ctx,
const StringTensor& x,
bool use_utf8_encoding,
StringTensor* out) {
StringCaseConvertKernel<AsciiCaseConverter<ContextT, AsciiToUpper>,
UTF8CaseConverter<ContextT, UTF8ToUpper>,
ContextT>()(dev_ctx, x, use_utf8_encoding, out);
}
} // namespace strings
} // namespace phi
PD_REGISTER_GENERAL_KERNEL(strings_lower,
GPU,
ALL_LAYOUT,
phi::strings::StringLowerKernel<phi::GPUContext>,
pstring) {}
PD_REGISTER_GENERAL_KERNEL(strings_upper,
GPU,
ALL_LAYOUT,
phi::strings::StringUpperKernel<phi::GPUContext>,
pstring) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/core/string_tensor.h"
namespace phi {
namespace strings {
// Copies `src` into `dst`, possibly across places. `blocking` presumably
// requests a synchronous copy -- the registered implementations
// (strings_copy on CPU and GPU) currently do not consult it; verify
// against callers. Implementations are provided per backend.
template <typename Context>
void Copy(const Context& dev_ctx,
const StringTensor& src,
bool blocking,
StringTensor* dst);
} // namespace strings
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/strings/strings_empty_kernel.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
namespace strings {
// Allocates an uninitialized StringTensor with the given shape on the
// context's place.
template <typename Context>
void EmptyKernel(const Context& dev_ctx,
const ScalarArray& shape,
StringTensor* out) {
out->Resize(phi::make_ddim(shape.GetData()));
dev_ctx.template Alloc<dtype::pstring>(out);
}
// Allocates storage for `out` using its current meta; the shape is
// expected to have been set beforehand (e.g. by UnchangedInferMeta).
template <typename Context>
void EmptyLikeKernel(const Context& dev_ctx, StringTensor* out) {
dev_ctx.template Alloc<dtype::pstring>(out);
}
} // namespace strings
} // namespace phi
using pstring = ::phi::dtype::pstring;
PD_REGISTER_GENERAL_KERNEL(strings_empty,
CPU,
ALL_LAYOUT,
phi::strings::EmptyKernel<phi::CPUContext>,
pstring) {}
PD_REGISTER_GENERAL_KERNEL(strings_empty_like,
CPU,
ALL_LAYOUT,
phi::strings::EmptyLikeKernel<phi::CPUContext>,
pstring) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_GENERAL_KERNEL(strings_empty,
GPU,
ALL_LAYOUT,
phi::strings::EmptyKernel<phi::GPUContext>,
pstring) {}
PD_REGISTER_GENERAL_KERNEL(strings_empty_like,
GPU,
ALL_LAYOUT,
phi::strings::EmptyLikeKernel<phi::GPUContext>,
pstring) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/api/lib/utils/storage.h"
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/string_tensor.h"
#include "paddle/phi/infermeta/strings/nullary.h"
#include "paddle/phi/infermeta/strings/unary.h"
namespace phi {
namespace strings {
template <typename Context>
void EmptyKernel(const Context& dev_ctx,
const ScalarArray& shape,
StringTensor* out);
template <typename Context>
void EmptyLikeKernel(const Context& dev_ctx, StringTensor* out);
// TODO(zhoushunjie): the tensor creation method need to be replaced later,
// all kernel api call Empty here instead of making tensor self
// Creates a StringTensor from a meta, backed by a DefaultAllocator for
// the context's place.
// NOTE(review): the StringTensor receives the raw allocator pointer while
// the owning unique_ptr is destroyed when this function returns -- verify
// StringTensor never allocates through that pointer after construction.
template <typename Context>
StringTensor Empty(const Context& dev_ctx, StringTensorMeta&& meta) {
auto allocator = std::make_unique<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
phi::StringTensor string_out(allocator.get(), std::move(meta));
return string_out;
}
// Convenience overload: creates a StringTensor with the placeholder
// shape {-1} (dims unknown until a later Resize).
template <typename Context>
StringTensor Empty(const Context& dev_ctx) {
return Empty<Context>(dev_ctx, {{-1}});
}
// Creates a StringTensor of the given shape: runs the strings infermeta
// to fill the meta, then allocates via EmptyKernel.
template <typename Context>
StringTensor Empty(const Context& dev_ctx, const ScalarArray& shape) {
StringTensor string_out;
MetaTensor meta_out(&string_out);
phi::strings::CreateInferMeta(shape, &meta_out);
EmptyKernel<Context>(dev_ctx, shape, &string_out);
return string_out;
}
// Creates an uninitialized StringTensor with the same meta as `x`.
template <typename Context>
StringTensor EmptyLike(const Context& dev_ctx, const StringTensor& x) {
StringTensor string_out;
MetaTensor meta_out(&string_out);
phi::strings::UnchangedInferMeta(x.meta(), &meta_out);
EmptyLikeKernel<Context>(dev_ctx, &string_out);
return string_out;
}
} // namespace strings
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/api/lib/utils/storage.h"
#include "paddle/phi/core/string_tensor.h"
#include "paddle/phi/infermeta/strings/unary.h"
#include "paddle/phi/kernels/strings/case_utils.h"
using pstring = ::phi::dtype::pstring;
namespace phi {
namespace strings {
// Case-conversion kernel entry points, implemented per backend. When
// `use_utf8_encoding` is false the fast ASCII path is used; otherwise
// the strings are decoded/re-encoded as UTF-8.
template <typename ContextT>
void StringLowerKernel(const ContextT& dev_ctx,
const StringTensor& x,
bool use_utf8_encoding,
StringTensor* out);
template <typename ContextT>
void StringUpperKernel(const ContextT& dev_ctx,
const StringTensor& x,
bool use_utf8_encoding,
StringTensor* out);
// C++-API style wrapper: infers the output meta from `x` and returns a
// newly created lower-cased StringTensor.
template <typename ContextT>
StringTensor StringLower(const ContextT& dev_ctx,
const StringTensor& x,
bool use_utf8_encoding) {
StringTensor string_out;
MetaTensor meta_out(&string_out);
UnchangedInferMeta(x.meta(), &meta_out);
StringLowerKernel(dev_ctx, x, use_utf8_encoding, &string_out);
return string_out;
}
// C++-API style wrapper: infers the output meta from `x` and returns a
// newly created upper-cased StringTensor.
template <typename ContextT>
StringTensor StringUpper(const ContextT& dev_ctx,
const StringTensor& x,
bool use_utf8_encoding) {
StringTensor string_out;
MetaTensor meta_out(&string_out);
UnchangedInferMeta(x.meta(), &meta_out);
StringUpperKernel(dev_ctx, x, use_utf8_encoding, &string_out);
return string_out;
}
// Dispatches a case conversion over a StringTensor: the ASCII converter
// runs by default, the UTF-8 converter when `use_utf8_encoding` is set.
template <typename AsciiCoverter, typename UTF8Converter, typename ContextT>
struct StringCaseConvertKernel {
  void operator()(const ContextT& dev_ctx,
                  const StringTensor& x,
                  bool use_utf8_encoding,
                  StringTensor* out) {
    const pstring* in_ptr = x.data();
    pstring* out_ptr = dev_ctx.template Alloc<pstring>(out);
    const auto num = x.numel();
    if (use_utf8_encoding) {
      UTF8Converter()(dev_ctx, in_ptr, out_ptr, num);
    } else {
      AsciiCoverter()(dev_ctx, in_ptr, out_ptr, num);
    }
  }
};
template <typename DeviceContext, typename CharConverter>
struct AsciiCaseConverter {
void operator()(const DeviceContext& dev_ctx,
const pstring* in,
pstring* out,
size_t num) const {
for (size_t i = 0; i < num; ++i) {
std::transform(
in[i].begin(), in[i].end(), out[i].mdata(), CharConverter());
}
}
};
// CPU UTF-8 converter. Per string: decode the UTF-8 bytes to unicode
// code points, case-convert each code point through the flag/case
// tables, then re-encode to UTF-8 and assign the result.
template <typename DeviceContext,
template <typename DeviceContextT> typename CharConverter>
struct UTF8CaseConverter {
void operator()(const DeviceContext& dev_ctx,
const pstring* in,
pstring* out,
size_t num) const {
auto unicode_flag_map = GetUniFlagMap();
auto cases_map = GetCharcasesMap();
for (size_t i = 0; i < num; ++i) {
uint32_t unicode_len = GetUnicodeStrLen(in[i].data(), in[i].size());
std::vector<uint32_t> unicode_in(unicode_len, 0);
GetUnicodeStr(in[i].data(), unicode_in.data(), unicode_len);
std::transform(unicode_in.begin(),
unicode_in.end(),
unicode_in.begin(),
CharConverter<DeviceContext>(unicode_flag_map, cases_map));
uint32_t utf8_len = GetUTF8StrLen(unicode_in.data(), unicode_len);
std::vector<char> result(utf8_len, 0);
GetUTF8Str(unicode_in.data(), result.data(), unicode_len);
// Assigning from char* -- presumably the encoded buffer is
// NUL-terminated within utf8_len bytes; verify GetUTF8StrLen
// accounts for the trailing '\0'.
out[i] = result.data();
}
}
};
} // namespace strings
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/strings/unicode.h"
#include <utf8proc.h>
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/kernels/strings/unicode_flag.h"
namespace phi {
namespace strings {
static const void* utils_map[4] = {nullptr};
static uint16_t CHARCASES_MAP[65536] = {0};
// Returns the unicode flag table, memoizing the pointer in utils_map[1]
// on first use.
const uint8_t* GetUniFlagMap() {
  const void*& slot = utils_map[1];
  if (slot == nullptr) {
    slot = UNIFLAG_MAP;
  }
  return static_cast<const uint8_t*>(slot);
}
// Lazily builds and returns the 64K-entry case-swap table: each BMP code
// point maps to its opposite case via utf8proc, or stays 0 if caseless.
// NOTE(review): this lazy initialization is not thread-safe -- confirm
// the first call always happens before concurrent use.
const uint16_t* GetCharcasesMap() {
if (utils_map[0] == nullptr) {
for (uint32_t i = 0; i < 65536; ++i) {
if (utf8proc_islower(i)) {
CHARCASES_MAP[i] = utf8proc_toupper(i);
} else if (utf8proc_isupper(i)) {
CHARCASES_MAP[i] = utf8proc_tolower(i);
}
}
utils_map[0] = CHARCASES_MAP;
}
return reinterpret_cast<const uint16_t*>(utils_map[0]);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Returns a device-resident copy of the unicode flag table, uploaded on
// first use and cached in utils_map[3] for the life of the process (the
// device buffer is intentionally never freed).
// NOTE(review): the hipMalloc/cudaMalloc return codes are not checked,
// and the lazy init is not thread-safe -- confirm both are acceptable.
const uint8_t* GetGPUUniflagMap() {
if (utils_map[3] == nullptr) {
const uint8_t* cpu_uniflag = GetUniFlagMap();
auto size = sizeof(UNIFLAG_MAP);
uint8_t* gpu_uniflag;
#ifdef PADDLE_WITH_HIP
hipMalloc(reinterpret_cast<void**>(&gpu_uniflag), size);
phi::backends::gpu::GpuMemcpySync(
gpu_uniflag, cpu_uniflag, size, hipMemcpyHostToDevice);
#else
cudaMalloc(reinterpret_cast<void**>(&gpu_uniflag), size);
phi::backends::gpu::GpuMemcpySync(
gpu_uniflag, cpu_uniflag, size, cudaMemcpyHostToDevice);
#endif
utils_map[3] = gpu_uniflag;
}
return reinterpret_cast<const uint8_t*>(utils_map[3]);
}
// Returns a device-resident copy of the case-swap table, uploaded on
// first use and cached in utils_map[2] for the life of the process (the
// device buffer is intentionally never freed).
// NOTE(review): the hipMalloc/cudaMalloc return codes are not checked,
// and the lazy init is not thread-safe -- confirm both are acceptable.
const uint16_t* GetGPUCharcasesMap() {
if (utils_map[2] == nullptr) {
const uint16_t* cpu_charcases = GetCharcasesMap();
auto size = sizeof(CHARCASES_MAP);
uint16_t* gpu_charcases;
#ifdef PADDLE_WITH_HIP
hipMalloc(reinterpret_cast<void**>(&gpu_charcases), size);
phi::backends::gpu::GpuMemcpySync(
gpu_charcases, cpu_charcases, size, hipMemcpyHostToDevice);
#else
cudaMalloc(reinterpret_cast<void**>(&gpu_charcases), size);
phi::backends::gpu::GpuMemcpySync(
gpu_charcases, cpu_charcases, size, cudaMemcpyHostToDevice);
#endif
utils_map[2] = gpu_charcases;
}
return reinterpret_cast<const uint16_t*>(utils_map[2]);
}
#endif
} // namespace strings
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstring>
#include <memory>
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/core/macros.h"
namespace phi {
namespace strings {
// Character-class predicates over a per-character flag byte.
// Bit layout (from the masks below): bit0 decimal, bit1 numeric, bit2 digit,
// bit3 alpha, bit4 space, bit5 upper, bit6 lower; IsAlphaNum tests bits 0-3.
// NOTE(review): the argument is the flag value, not a code point —
// presumably callers pass the value looked up in the unicode flag table;
// confirm at call sites.
HOSTDEVICE inline bool IsSpace(uint32_t chr) { return (chr & 16) > 0; }
HOSTDEVICE inline bool IsAlpha(uint32_t chr) { return (chr & 8) > 0; }
HOSTDEVICE inline bool IsDigit(uint32_t chr) { return (chr & 4) > 0; }
HOSTDEVICE inline bool IsNumeric(uint32_t chr) { return (chr & 2) > 0; }
HOSTDEVICE inline bool IsDecimal(uint32_t chr) { return (chr & 1) > 0; }
HOSTDEVICE inline bool IsAlphaNum(uint32_t chr) { return (chr & 15) > 0; }
HOSTDEVICE inline bool IsUpper(uint32_t chr) { return (chr & 32) > 0; }
HOSTDEVICE inline bool IsLower(uint32_t chr) { return (chr & 64) > 0; }
// Byte length of the UTF-8 sequence whose lead byte is `byte`:
// 1 for ASCII, 2/3/4 for multi-byte lead bytes, and 0 for a lone
// continuation byte (0b10xxxxxx). Branch-free so host and device
// threads never diverge here.
HOSTDEVICE inline uint32_t BytesInUtf8Char(uint8_t byte) {
  uint32_t n = 1;
  n += static_cast<uint32_t>((byte & 0xC0) == 0xC0);  // at least 2 bytes
  n += static_cast<uint32_t>((byte & 0xE0) == 0xE0);  // at least 3 bytes
  n += static_cast<uint32_t>((byte & 0xF0) == 0xF0);  // 4 bytes
  n -= static_cast<uint32_t>((byte & 0xC0) == 0x80);  // continuation -> 0
  return n;
}
// Packs the UTF-8 encoded character starting at pSrc into *chr, keeping
// the raw bytes in big-endian order inside the uint32, and returns the
// character's byte width (as reported by BytesInUtf8Char on the lead byte).
HOSTDEVICE inline uint32_t UTF8ToUInt32(const char* pSrc, uint32_t* chr) {
  const uint32_t width = BytesInUtf8Char(static_cast<uint8_t>(*pSrc));
  uint32_t packed = static_cast<uint32_t>(*pSrc) & 0xFF;
  // Append each continuation byte below the ones already collected.
  for (uint32_t i = 1; i < width; ++i) {
    packed = (packed << 8) | (static_cast<uint32_t>(pSrc[i]) & 0xFF);
  }
  *chr = packed;
  return width;
}
// Converts a packed UTF-8 value (as produced by UTF8ToUInt32, bytes in
// big-endian order) into the unicode code point by stripping the UTF-8
// lead/continuation markers and concatenating the payload bits.
HOSTDEVICE inline uint32_t UTF8ToUnicode(uint32_t utf8) {
  uint32_t unchr = 0;
  if (utf8 < 0x00000080) {  // 1-byte sequence: ASCII, identity mapping
    unchr = utf8;
  } else if (utf8 < 0x0000E000) {  // 2-byte sequence: 110xxxxx 10xxxxxx
    unchr = (utf8 & 0x1F00) >> 2;
    unchr |= (utf8 & 0x003F);
  } else if (utf8 < 0x00F00000) {  // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
    unchr = (utf8 & 0x0F0000) >> 4;
    unchr |= (utf8 & 0x003F00) >> 2;
    unchr |= (utf8 & 0x00003F);
  } else if (utf8 <= static_cast<uint32_t>(0xF8000000)) {  // 4-byte sequence
    unchr = (utf8 & 0x03000000) >> 6;
    unchr |= (utf8 & 0x003F0000) >> 4;
    unchr |= (utf8 & 0x00003F00) >> 2;
    unchr |= (utf8 & 0x0000003F);
  }
  // Values not matching any branch (malformed input) return 0.
  return unchr;
}
// Inverse of UTF8ToUnicode: encodes a unicode code point as a packed UTF-8
// value (lead/continuation markers OR-ed in, bytes in big-endian order).
HOSTDEVICE inline uint32_t UnicodeToUTF8(uint32_t unchr) {
  uint32_t utf8 = 0;
  if (unchr < 0x00000080) {  // 1-byte utf8 (ASCII)
    utf8 = unchr;
  } else if (unchr < 0x00000800) {  // 2-byte utf8
    utf8 = (unchr << 2) & 0x1F00;
    utf8 |= (unchr & 0x3F);
    utf8 |= 0x0000C080;
  } else if (unchr < 0x00010000) {  // 3-byte utf8
    utf8 = (unchr << 4) & 0x0F0000;   // upper 4 bits
    utf8 |= (unchr << 2) & 0x003F00;  // next 6 bits
    utf8 |= (unchr & 0x3F);           // last 6 bits
    utf8 |= 0x00E08080;
  } else if (unchr < 0x00110000) {  // 4-byte utf8 (code points up to U+10FFFF)
    utf8 = (unchr << 6) & 0x07000000;   // upper 3 bits
    utf8 |= (unchr << 4) & 0x003F0000;  // next 6 bits
    utf8 |= (unchr << 2) & 0x00003F00;  // next 6 bits
    utf8 |= (unchr & 0x3F);             // last 6 bits
    utf8 |= static_cast<uint32_t>(0xF0808080);
  }
  // Code points >= 0x110000 (invalid) return 0.
  return utf8;
}
// Number of significant bytes in a packed UTF-8 value: one plus one for
// each non-zero higher byte. Branch-free for device friendliness.
HOSTDEVICE inline uint32_t BytesInUnicodeChar(uint32_t chr) {
  uint32_t nbytes = 1;
  nbytes += static_cast<uint32_t>((chr & static_cast<uint32_t>(0xFF000000)) != 0);
  nbytes += static_cast<uint32_t>((chr & static_cast<uint32_t>(0x00FF0000)) != 0);
  nbytes += static_cast<uint32_t>((chr & static_cast<uint32_t>(0x0000FF00)) != 0);
  return nbytes;
}
// Serializes a packed UTF-8 value into dst, most significant byte first,
// and returns the number of bytes written. dst is NOT null-terminated here.
HOSTDEVICE inline uint32_t UnicodeToUTF8Char(uint32_t chr, char* dst) {
  const uint32_t nbytes = BytesInUnicodeChar(chr);
  uint32_t rest = chr;
  // Emit from the last byte backwards while peeling bytes off `rest`.
  for (uint32_t pos = nbytes; pos > 0; --pos) {
    dst[pos - 1] = static_cast<char>(rest & 0xFF);
    rest >>= 8;
  }
  return nbytes;
}
// Decodes exactly `unicode_len` UTF-8 characters from pSrc into
// unicode_str (one code point per element).
// NOTE(review): each iteration pre-decodes the NEXT character, so the
// routine reads one character position past the last requested one. That
// is harmless only if the buffer is null-terminated (decoding '\0' reads a
// single byte) — confirm all callers guarantee a terminator.
HOSTDEVICE inline void GetUnicodeStr(const char* pSrc,
                                     uint32_t* unicode_str,
                                     size_t unicode_len) {
  uint32_t curr_unicode_char;
  uint32_t count = UTF8ToUInt32(pSrc, &curr_unicode_char);
  curr_unicode_char = UTF8ToUnicode(curr_unicode_char);
  for (size_t i = 0; i < unicode_len; ++i) {
    unicode_str[i] = curr_unicode_char;
    pSrc += count;
    count = UTF8ToUInt32(pSrc, &curr_unicode_char);
    curr_unicode_char = UTF8ToUnicode(curr_unicode_char);
  }
}
// Counts the UTF-8 characters contained in the first `size` bytes of pSrc.
// A lead-byte width of 0 (malformed continuation byte) terminates the scan
// early so the loop cannot spin forever.
HOSTDEVICE inline uint32_t GetUnicodeStrLen(const char* pSrc, size_t size) {
  uint32_t nchars = 0;
  uint32_t consumed = 0;
  uint32_t decoded;
  while (consumed < size) {
    const uint32_t width = UTF8ToUInt32(pSrc, &decoded);
    consumed += width;
    pSrc += width;
    if (width == 0) {
      break;
    }
    ++nchars;
  }
  return nchars;
}
// Buffer size (including the trailing '\0') needed to encode `unicode_len`
// code points from unicode_str as UTF-8.
HOSTDEVICE inline uint32_t GetUTF8StrLen(const uint32_t* unicode_str,
                                         size_t unicode_len) {
  uint32_t total = 1;  // reserve one byte for the trailing '\0'
  for (size_t idx = 0; idx < unicode_len; ++idx) {
    total += BytesInUnicodeChar(UnicodeToUTF8(unicode_str[idx]));
  }
  return total;
}
// Encodes `unicode_len` code points into utf8_str and null-terminates the
// result. The caller must guarantee utf8_str is large enough — use
// GetUTF8StrLen to size it.
HOSTDEVICE inline void GetUTF8Str(const uint32_t* unicode_str,
                                  char* utf8_str,
                                  size_t unicode_len) {
  char scratch[5] = {0};  // at most 4 utf8 bytes + '\0'
  char* out = utf8_str;
  for (size_t idx = 0; idx < unicode_len; ++idx) {
    const uint32_t packed = UnicodeToUTF8(unicode_str[idx]);
    const uint32_t nbytes = UnicodeToUTF8Char(packed, scratch);
    scratch[nbytes] = '\0';
    memcpy(out, scratch, nbytes);
    out += nbytes;
  }
  *out = '\0';
}
// Host-side accessors for the character lookup tables used by the case
// convert kernels: per-character unicode flags and the lower/upper mapping.
const uint8_t* GetUniFlagMap();
const uint16_t* GetCharcasesMap();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Device-resident copies of the same tables, uploaded lazily on first use.
const uint8_t* GetGPUUniflagMap();
const uint16_t* GetGPUCharcasesMap();
#endif
} // namespace strings
} // namespace phi
This diff is collapsed.
......@@ -27,3 +27,5 @@ cc_test(test_split_api SRCS test_split_api.cc DEPS ${COMMON_API_TEST_DEPS})
cc_test(test_data_transform SRCS test_data_transform.cc DEPS ${COMMON_API_TEST_DEPS})
cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS ${COMMON_API_TEST_DEPS})
cc_test(test_sparse_conv_api SRCS test_sparse_conv_api.cc DEPS ${COMMON_API_TEST_DEPS})
cc_test(test_strings_empty_api SRCS test_strings_empty_api.cc DEPS ${COMMON_API_TEST_DEPS})
cc_test(test_strings_lower_upper_api SRCS test_strings_lower_upper_api.cc DEPS ${COMMON_API_TEST_DEPS})
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See
the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <memory>
#include "paddle/phi/api/include/strings_api.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/common/backend.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/string_tensor.h"
PD_DECLARE_KERNEL(strings_empty, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(strings_empty_like, CPU, ALL_LAYOUT);
namespace paddle {
namespace tests {
using phi::CPUPlace;
using phi::StringTensor;
using phi::StringTensorMeta;
// strings::empty must build a StringTensor with the shape given by the
// int64 shape tensor {2, 3}.
TEST(API, strings_empty) {
  // 1. build the shape tensor on CPU
  auto cpu = CPUPlace();
  const auto alloc =
      std::make_shared<paddle::experimental::DefaultAllocator>(cpu);
  auto dense_shape = std::make_shared<phi::DenseTensor>(
      alloc.get(),
      phi::DenseTensorMeta(
          phi::DataType::INT64, phi::make_ddim({2}), phi::DataLayout::NCHW));
  auto* raw_shape =
      dense_shape->mutable_data<int64_t>(paddle::platform::CPUPlace());
  raw_shape[0] = 2;
  raw_shape[1] = 3;
  paddle::experimental::Tensor shape_arg(dense_shape);
  // 2. call the API under test
  auto empty_out = paddle::experimental::strings::empty(shape_arg);
  // 3. the result must be a 2x3 tensor holding 6 elements
  ASSERT_EQ(empty_out.dims().size(), 2);
  ASSERT_EQ(empty_out.dims()[0], 2);
  ASSERT_EQ(empty_out.dims()[1], 3);
  ASSERT_EQ(empty_out.numel(), 6);
}
// strings::empty_like must build a StringTensor whose shape matches the
// input tensor x exactly.
TEST(API, strings_empty_like) {
  auto cpu = CPUPlace();
  const auto alloc =
      std::make_shared<paddle::experimental::DefaultAllocator>(cpu);
  // 1. create a 1x2 source StringTensor
  const phi::DDim dims({1, 2});
  StringTensorMeta meta(dims);
  auto cpu_strings_x = std::make_shared<phi::StringTensor>(
      alloc.get(), phi::StringTensorMeta(meta));
  // 2. test API
  paddle::experimental::Tensor x(cpu_strings_x);
  auto empty_like_out = paddle::experimental::strings::empty_like(x);
  // 3. check result: every dimension must match x
  ASSERT_EQ(empty_like_out.dims().size(), 2);
  ASSERT_EQ(empty_like_out.dims()[0], 1);
  // The second dimension was previously unchecked, so a transposed or
  // otherwise wrong shape with the right numel could slip through.
  ASSERT_EQ(empty_like_out.dims()[1], 2);
  ASSERT_EQ(empty_like_out.numel(), 2);
}
} // namespace tests
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See
the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <memory>
#include "paddle/phi/api/include/strings_api.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/string_tensor.h"
PD_DECLARE_KERNEL(strings_lower, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(strings_upper, CPU, ALL_LAYOUT);
namespace paddle {
namespace tests {
using phi::CPUPlace;
using phi::StringTensor;
using phi::StringTensorMeta;
// End-to-end check of strings::lower / strings::upper with
// use_utf8_encoding = false (ASCII path). One input fits pstring's inline
// buffer, the other is longer than 22 bytes and exercises the heap path.
TEST(API, case_convert) {
  auto cpu = CPUPlace();
  const auto alloc =
      std::make_shared<paddle::experimental::DefaultAllocator>(cpu);
  // 1. create tensor
  const phi::DDim dims({1, 2});
  StringTensorMeta meta(dims);
  auto cpu_strings_x = std::make_shared<phi::StringTensor>(
      alloc.get(), phi::StringTensorMeta(meta));
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  auto* dev_ctx = pool.Get(phi::CPUPlace());
  pstring* cpu_strings_x_data =
      dev_ctx->template Alloc<pstring>(cpu_strings_x.get());
  std::string strs[] = {"A Short Pstring.",
                        "A Large Pstring Whose Length Is Longer Than 22."};
  for (int i = 0; i < 2; ++i) {
    cpu_strings_x_data[i] = strs[i];
  }
  // 2. get expected results via std::tolower/toupper as the reference
  std::string expected_results[] = {strs[0], strs[0], strs[1], strs[1]};
  std::transform(
      strs[0].begin(), strs[0].end(), expected_results[0].begin(), ::tolower);
  std::transform(
      strs[0].begin(), strs[0].end(), expected_results[1].begin(), ::toupper);
  std::transform(
      strs[1].begin(), strs[1].end(), expected_results[2].begin(), ::tolower);
  std::transform(
      strs[1].begin(), strs[1].end(), expected_results[3].begin(), ::toupper);
  // 3. test API, ascii encoding
  paddle::experimental::Tensor x(cpu_strings_x);
  auto lower_out = paddle::experimental::strings::lower(x, false);
  auto upper_out = paddle::experimental::strings::upper(x, false);
  auto lower_tensor =
      std::dynamic_pointer_cast<phi::StringTensor>(lower_out.impl());
  auto upper_tensor =
      std::dynamic_pointer_cast<phi::StringTensor>(upper_out.impl());
  ASSERT_EQ(lower_tensor->dims(), dims);
  ASSERT_EQ(upper_tensor->dims(), dims);
  auto lower_tensor_ptr = lower_tensor->data();
  auto upper_tensor_ptr = upper_tensor->data();
  // Results interleaved as {lower[0], upper[0], lower[1], upper[1]} to
  // match the ordering of expected_results above.
  const std::string cpu_results[] = {lower_tensor_ptr[0].data(),
                                     upper_tensor_ptr[0].data(),
                                     lower_tensor_ptr[1].data(),
                                     upper_tensor_ptr[1].data()};
  for (int i = 0; i < 4; ++i) {
    ASSERT_EQ(cpu_results[i], expected_results[i]);
  }
}
// End-to-end check of strings::lower / strings::upper with
// use_utf8_encoding = true on multi-byte (accented latin) input.
TEST(API, case_convert_utf8) {
  auto cpu = CPUPlace();
  const auto alloc =
      std::make_shared<paddle::experimental::DefaultAllocator>(cpu);
  // 1. create tensor
  const phi::DDim dims({1, 2});
  StringTensorMeta meta(dims);
  auto cpu_strings_x = std::make_shared<phi::StringTensor>(
      alloc.get(), phi::StringTensorMeta(meta));
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  auto* dev_ctx = pool.Get(phi::CPUPlace());
  pstring* cpu_strings_x_data =
      dev_ctx->template Alloc<pstring>(cpu_strings_x.get());
  std::string strs[] = {"óÓsscHloëË", "óÓsscHloëËóÓsscHloëËóÓsscHloëË"};
  for (int i = 0; i < 2; ++i) {
    cpu_strings_x_data[i] = strs[i];
  }
  // 2. get expected results (hand-written; std::tolower cannot be used as
  // a reference for multi-byte characters)
  std::string expected_results[] = {"óósschloëë",
                                    "ÓÓSSCHLOËË",
                                    "óósschloëëóósschloëëóósschloëë",
                                    "ÓÓSSCHLOËËÓÓSSCHLOËËÓÓSSCHLOËË"};
  // 3. test API, utf8 encoding
  paddle::experimental::Tensor x(cpu_strings_x);
  auto lower_out = paddle::experimental::strings::lower(x, true);
  auto upper_out = paddle::experimental::strings::upper(x, true);
  auto lower_tensor =
      std::dynamic_pointer_cast<phi::StringTensor>(lower_out.impl());
  auto upper_tensor =
      std::dynamic_pointer_cast<phi::StringTensor>(upper_out.impl());
  ASSERT_EQ(lower_tensor->dims(), dims);
  ASSERT_EQ(upper_tensor->dims(), dims);
  auto lower_tensor_ptr = lower_tensor->data();
  auto upper_tensor_ptr = upper_tensor->data();
  const char* cpu_results[] = {lower_tensor_ptr[0].data(),
                               upper_tensor_ptr[0].data(),
                               lower_tensor_ptr[1].data(),
                               upper_tensor_ptr[1].data()};
  for (int i = 0; i < 4; ++i) {
    ASSERT_EQ(std::string(cpu_results[i]), expected_results[i]);
  }
}
} // namespace tests
} // namespace paddle
......@@ -64,6 +64,9 @@ TEST(DataType, OStream) {
oss << phi::DataType::COMPLEX128;
EXPECT_EQ(oss.str(), "complex128");
oss.str("");
oss << phi::DataType::PSTRING;
EXPECT_EQ(oss.str(), "pstring");
oss.str("");
try {
oss << phi::DataType::NUM_DATA_TYPES;
} catch (const std::exception& exception) {
......
......@@ -23,5 +23,5 @@ endif()
if (NOT WIN32)
cc_test(test_rw_lock SRCS test_rw_lock.cc)
endif (NOT WIN32)
cc_test(test_string_tensor SRCS test_string_tensor.cc DEPS string_tensor)
cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sstream>
#include <string>
#include <utility>
#include "gtest/gtest.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/pstring.h"
#include "paddle/phi/core/string_tensor.h"
#include "paddle/phi/tests/core/allocator.h"
namespace phi {
namespace tests {
using pstring = ::phi::dtype::pstring;
// Exercises StringTensor construction, element access via Alloc<pstring>,
// and the copy/move constructors and assignments.
TEST(string_tensor, ctor) {
  const DDim dims({1, 2});
  StringTensorMeta meta(dims);
  const auto string_allocator =
      std::make_unique<paddle::experimental::DefaultAllocator>(
          paddle::platform::CPUPlace());
  const auto alloc = string_allocator.get();
  // Bundle of invariants a freshly-constructed CPU StringTensor must hold.
  auto check_string_tensor = [](const StringTensor& t,
                                const StringTensorMeta& m) -> bool {
    bool r{true};
    r = r && (t.numel() == product(m.dims));
    r = r && (t.dims() == m.dims);
    r = r && (t.place() == paddle::platform::CPUPlace());
    r = r && t.initialized();
    r = r && t.IsSharedWith(t);
    r = r && (t.meta() == m);
    return r;
  };
  auto cpu = CPUPlace();
  paddle::platform::DeviceContextPool& pool =
      paddle::platform::DeviceContextPool::Instance();
  CPUContext* cpu_ctx = reinterpret_cast<CPUContext*>(pool.Get(cpu));
  StringTensor tensor_0(alloc, meta);
  // NOTE(review): the lambda's boolean result is discarded here — wrap it
  // in CHECK_EQ(..., true) so a violated invariant actually fails the test.
  check_string_tensor(tensor_0, meta);
  // One string inside the inline buffer, one forcing a heap allocation.
  pstring pshort_str = pstring("A short pstring.");
  pstring plong_str =
      pstring("A large pstring whose length is longer than 22.");
  pstring* data = cpu_ctx->template Alloc<pstring>(&tensor_0);
  data[0] = plong_str;
  data[1] = pshort_str;
  CHECK_EQ(tensor_0.data()[0], plong_str);
  CHECK_EQ(tensor_0.data()[1], pshort_str);
  // Test Copy Constructor
  StringTensor tensor_1(tensor_0);
  CHECK_EQ(tensor_1.data()[0], plong_str);
  CHECK_EQ(tensor_1.data()[1], pshort_str);
  // Test Copy Assignment
  StringTensor tensor_2(alloc, meta);
  tensor_2 = tensor_1;
  CHECK_EQ(tensor_2.data()[0], plong_str);
  CHECK_EQ(tensor_2.data()[1], pshort_str);
  // Test Move Assignment (tensor_1 is moved-from past this point)
  StringTensor tensor_3(alloc, meta);
  tensor_3 = std::move(tensor_1);
  CHECK_EQ(tensor_3.data()[0], plong_str);
  CHECK_EQ(tensor_3.data()[1], pshort_str);
  tensor_3.set_meta(meta);
}
// Unit tests for the pstring value type: constructors, concatenation,
// mutation, capacity/reserve behavior, streaming, iteration, comparison,
// and assignment (copy and move).
TEST(pstring, func) {
  // Test Ctor
  pstring empty_str;
  pstring nchar_str(5, 'A');
  pstring copy_nchar_str(nchar_str);
  CHECK_EQ(empty_str, "");
  CHECK_EQ(nchar_str, "AAAAA");
  CHECK_EQ(copy_nchar_str, "AAAAA");
  // Test Move Ctor
  pstring move_nchar_str(std::move(nchar_str));
  CHECK_EQ(move_nchar_str, "AAAAA");
  pstring std_str(std::string("BBBB"));
  CHECK_EQ(std_str, "BBBB");
  // One heap-backed string (> 22 chars) and one inline (SSO) string.
  pstring long_str = "A large pstring whose length is longer than 22.";
  pstring short_str = "A short pstring.";
  // Test operator+
  pstring plus_str = move_nchar_str + std_str;
  CHECK_EQ(plus_str, "AAAAABBBB");
  // Test insert
  plus_str.insert(5, 1, 'C');
  CHECK_EQ(plus_str, "AAAAACBBBB");
  plus_str.insert(5, "DDD", 0, 2);
  CHECK_EQ(plus_str, "AAAAADDCBBBB");
  // Test pushback
  plus_str.push_back('E');
  CHECK_EQ(plus_str, "AAAAADDCBBBBE");
  // Test append
  plus_str.append("FF");
  CHECK_EQ(plus_str, "AAAAADDCBBBBEFF");
  plus_str.append(2, 'G');
  CHECK_EQ(plus_str, "AAAAADDCBBBBEFFGG");
  // Test operator[]
  CHECK_EQ(long_str[0], 'A');
  CHECK_EQ(short_str[0], 'A');
  // Test capacity (inline buffer holds 22 chars)
  CHECK_EQ(short_str.capacity(), 22UL);
  // Test reserve
  pstring reserve_str;
  CHECK_EQ(reserve_str.capacity(), 22UL);
  // small -> large
  reserve_str.reserve(100);
  CHECK_EQ(reserve_str.capacity(), 111UL);  // align(100) - 1 = 111
  // reserve more memory
  reserve_str.reserve(200);
  CHECK_EQ(reserve_str.capacity(), 207UL);  // align(200) - 1 = 207
  // Test operator<<
  std::ostringstream oss1, oss2;
  oss1 << long_str;
  CHECK_EQ(oss1.str(), long_str);
  // Test iterator
  for (auto it = long_str.begin(); it != long_str.end(); ++it) {
    oss2 << *it;
  }
  CHECK_EQ(oss2.str(), long_str);
  // Test comparison operators ("A large..." < "A short..." since 'l' < 's')
  CHECK_EQ((long_str < short_str), true);
  CHECK_EQ((long_str > short_str), false);
  CHECK_EQ((long_str == short_str), false);
  CHECK_EQ((long_str != short_str), true);
  CHECK_EQ((short_str < long_str), false);
  CHECK_EQ((short_str > long_str), true);
  CHECK_EQ((move_nchar_str < plus_str), true);
  CHECK_EQ((plus_str > move_nchar_str), true);
  // Test empty
  CHECK_EQ(empty_str.empty(), true);
  CHECK_EQ(nchar_str.empty(), false);
  CHECK_EQ(empty_str.length(), 0UL);
  // Test Resize
  nchar_str.resize(6, 'B');
  CHECK_EQ(nchar_str, "AAAAAB");
  // Test operator = (move, copy, char, then move again)
  long_str = std::move(nchar_str);
  CHECK_EQ(long_str, "AAAAAB");
  long_str = short_str;
  CHECK_EQ(short_str, long_str);
  short_str = 'A';
  CHECK_EQ(short_str, "A");
  short_str = std::move(copy_nchar_str);
  CHECK_EQ(short_str, "AAAAA");
}
} // namespace tests
} // namespace phi
......@@ -25,3 +25,18 @@ if(WITH_ROCM)
endif()
cc_test(test_cpu_vec SRCS test_cpu_vec.cc DEPS blas cpu_info)
# For String Kernels
cc_test(test_strings_lower_upper_dev_api SRCS test_strings_lower_upper_dev_api.cc DEPS phi phi_api_utils)
IF(WITH_GPU)
nv_test(test_strings_lower_upper_dev_gpu_api SRCS test_strings_lower_upper_dev_api.cu DEPS phi phi_api_utils)
ELSEIF(WITH_ROCM)
hip_test(test_strings_lower_upper_dev_gpu_api SRCS test_strings_lower_upper_dev_api.cu DEPS phi phi_api_utils)
ENDIF()
cc_test(test_strings_copy_dev_api SRCS test_strings_copy_dev_api.cc DEPS phi phi_api_utils)
IF(WITH_GPU)
nv_test(test_strings_copy_dev_gpu_api SRCS test_strings_copy_dev_api.cu DEPS phi phi_api_utils)
ELSEIF(WITH_ROCM)
hip_test(test_strings_copy_dev_gpu_api SRCS test_strings_copy_dev_api.cu DEPS phi phi_api_utils)
ENDIF()
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <algorithm>
#include <memory>
#include <string>
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/pstring.h"
#include "paddle/phi/core/string_tensor.h"
#include "paddle/phi/kernels/strings/strings_copy_kernel.h"
namespace phi {
namespace tests {
using DDim = phi::DDim;
using pstring = phi::dtype::pstring;
// Round-trips a CPU StringTensor through strings::Copy and verifies every
// element (short, long, and tiny strings alike) survives unchanged.
TEST(DEV_API, strings_copy) {
  // 1. build source and destination tensors of shape 2x3
  const DDim dims({2, 3});
  StringTensorMeta meta(dims);
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
  const auto string_allocator =
      std::make_unique<paddle::experimental::DefaultAllocator>(phi::CPUPlace());
  const auto alloc = string_allocator.get();
  StringTensor string_src(alloc, meta);
  StringTensor string_dst(alloc, meta);
  // 2. fill the source with a mix of inline-sized and heap-sized strings
  const char* samples[] = {"A Short Pstring.",
                           "A Large Pstring Whose Length Is Longer Than 22.",
                           "abc",
                           "defg",
                           "hijklmn",
                           "opqrst"};
  pstring* src_data = dev_ctx->template Alloc<pstring>(&string_src);
  for (int64_t idx = 0; idx < string_src.numel(); ++idx) {
    src_data[idx] = samples[idx];
  }
  // 3. copy and compare element-wise
  phi::strings::Copy(*dev_ctx, string_src, false, &string_dst);
  for (int64_t idx = 0; idx < string_src.numel(); ++idx) {
    ASSERT_EQ(string_src.data()[idx], string_dst.data()[idx]);
  }
}
} // namespace tests
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <algorithm>
#include <memory>
#include <string>
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/pstring.h"
#include "paddle/phi/core/string_tensor.h"
#include "paddle/phi/kernels/strings/strings_copy_kernel.h"
#include "paddle/phi/kernels/strings/strings_empty_kernel.h"
namespace phi {
namespace tests {
using DDim = phi::DDim;
using pstring = phi::dtype::pstring;
// Round-trips a StringTensor CPU -> GPU -> GPU -> CPU through
// strings::Copy and verifies all elements survive unchanged.
TEST(DEV_API, strings_copy) {
  // 1. create tensor
  const DDim dims({2, 3});
  StringTensorMeta meta(dims);
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  auto* dev_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
  auto* gpu_dev_ctx =
      reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
  // NOTE(review): `meta` is std::move'd into four Empty calls below; this
  // only works if StringTensorMeta's move leaves the source usable (it just
  // wraps a DDim here) — confirm, or pass by const reference instead.
  StringTensor string_src = phi::strings::Empty(*dev_ctx, std::move(meta));
  StringTensor string_dst = phi::strings::Empty(*dev_ctx, std::move(meta));
  // 2. Assign input text
  const char* input[] = {"A Short Pstring.",
                         "A Large Pstring Whose Length Is Longer Than 22.",
                         "abc",
                         "defg",
                         "hijklmn",
                         "opqrst"};
  pstring* string_src_data = dev_ctx->template Alloc<pstring>(&string_src);
  for (int i = 0; i < string_src.numel(); ++i) {
    string_src_data[i] = input[i];
  }
  StringTensor string_gpu1 = phi::strings::Empty(*gpu_dev_ctx, std::move(meta));
  StringTensor string_gpu2 = phi::strings::Empty(*gpu_dev_ctx, std::move(meta));
  // cpu->gpu
  phi::strings::Copy(*gpu_dev_ctx, string_src, false, &string_gpu1);
  // gpu->gpu
  phi::strings::Copy(*gpu_dev_ctx, string_gpu1, false, &string_gpu2);
  // gpu->cpu
  phi::strings::Copy(*gpu_dev_ctx, string_gpu2, false, &string_dst);
  for (int64_t i = 0; i < string_src.numel(); i++) {
    ASSERT_EQ(string_src.data()[i], string_dst.data()[i]);
  }
}
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <algorithm>
#include <memory>
#include <string>
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/pstring.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/string_tensor.h"
#include "paddle/phi/kernels/strings/strings_empty_kernel.h"
#include "paddle/phi/kernels/strings/strings_lower_upper_kernel.h"
#include "paddle/phi/backends/all_context.h"
namespace phi {
namespace tests {
namespace framework = paddle::framework;
using DDim = phi::DDim;
using pstring = ::phi::dtype::pstring;
// CPU device-API check of StringLower/StringUpper with the ASCII path
// (use_utf8_encoding = false), using std::tolower/toupper as the reference.
TEST(DEV_API, strings_cast_convert) {
  // 1. create tensor
  const DDim dims({1, 2});
  StringTensorMeta meta(dims);
  const auto string_allocator =
      std::make_unique<paddle::experimental::DefaultAllocator>(
          paddle::platform::CPUPlace());
  const auto alloc = string_allocator.get();
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
  StringTensor dense_x(alloc, meta);
  // One inline-sized string and one heap-sized string (> 22 chars).
  std::string short_str = "A Short Pstring.";
  std::string long_str = "A Large Pstring Whose Length Is Longer Than 22.";
  pstring* dense_x_data = dev_ctx->template Alloc<pstring>(&dense_x);
  dense_x_data[0] = short_str;
  dense_x_data[1] = long_str;
  // 2. get expected results, ordered {lower(short), upper(short),
  // lower(long), upper(long)}
  std::string expected_results[] = {short_str, short_str, long_str, long_str};
  std::transform(short_str.begin(),
                 short_str.end(),
                 expected_results[0].begin(),
                 ::tolower);
  std::transform(short_str.begin(),
                 short_str.end(),
                 expected_results[1].begin(),
                 ::toupper);
  std::transform(
      long_str.begin(), long_str.end(), expected_results[2].begin(), ::tolower);
  std::transform(
      long_str.begin(), long_str.end(), expected_results[3].begin(), ::toupper);
  // 3. test API, ascii encoding
  auto dense_lower_out = phi::strings::StringLower(
      *(static_cast<phi::CPUContext*>(dev_ctx)), dense_x, false);
  auto dense_upper_out = phi::strings::StringUpper(
      *(static_cast<phi::CPUContext*>(dev_ctx)), dense_x, false);
  // 4. check results
  ASSERT_EQ(dense_lower_out.numel(), 2);
  ASSERT_EQ(dense_upper_out.numel(), 2);
  // lower case
  ASSERT_EQ(dense_lower_out.data()[0].data(), expected_results[0]);
  ASSERT_EQ(dense_lower_out.data()[1].data(), expected_results[2]);
  // upper case
  ASSERT_EQ(dense_upper_out.data()[0].data(), expected_results[1]);
  ASSERT_EQ(dense_upper_out.data()[1].data(), expected_results[3]);
}
// CPU device-API check of StringLower/StringUpper with multi-byte input
// (use_utf8_encoding = true); expected strings are hand-written since
// std::tolower cannot serve as a reference for non-ASCII characters.
TEST(DEV_API, strings_cast_convert_utf8) {
  // 1. create tensor
  const DDim dims({1, 1});
  StringTensorMeta meta(dims);
  paddle::platform::DeviceContextPool& pool =
      paddle::platform::DeviceContextPool::Instance();
  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
  const auto string_allocator =
      std::make_unique<paddle::experimental::DefaultAllocator>(
          paddle::platform::CPUPlace());
  const auto alloc = string_allocator.get();
  StringTensor dense_x(alloc, meta);
  std::string utf8_str = "óÓsscHloëËóÓsscHloëËóÓsscHloëË";
  pstring* dense_x_data = dev_ctx->template Alloc<pstring>(&dense_x);
  dense_x_data[0] = utf8_str;
  // 2. get expected results
  std::string expected_results[] = {"óósschloëëóósschloëëóósschloëë",
                                    "ÓÓSSCHLOËËÓÓSSCHLOËËÓÓSSCHLOËË"};
  // 3. test API, utf8 encoding
  auto dense_lower_out = phi::strings::StringLower(
      *(static_cast<phi::CPUContext*>(dev_ctx)), dense_x, true);
  auto dense_upper_out = phi::strings::StringUpper(
      *(static_cast<phi::CPUContext*>(dev_ctx)), dense_x, true);
  // 4. check results
  ASSERT_EQ(dense_lower_out.numel(), 1);
  ASSERT_EQ(dense_upper_out.numel(), 1);
  // lower case
  // NOTE(review): VLOG(0) debug print left in — consider removing it or
  // lowering the verbosity level before merge.
  VLOG(0) << dense_lower_out.data()[0].data();
  ASSERT_EQ(dense_lower_out.data()[0].data(), expected_results[0]);
  // upper case
  ASSERT_EQ(dense_upper_out.data()[0].data(), expected_results[1]);
}
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <stdio.h>
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#if (defined(__NVCC__) || defined(__HIPCC__))
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#endif
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_helper.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/pstring.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/string_tensor.h"
#include "paddle/phi/kernels/strings/strings_copy_kernel.h"
#include "paddle/phi/kernels/strings/strings_empty_kernel.h"
#include "paddle/phi/kernels/strings/strings_lower_upper_kernel.h"
namespace phi {
namespace tests {
namespace framework = paddle::framework;
using DDim = phi::DDim;
using pstring = ::phi::dtype::pstring;
using phi::GPUPlace;
using phi::CPUPlace;
// GPU device-API check of StringLower/StringUpper (ASCII path): input is
// filled on CPU, copied to GPU, converted there, then copied back and
// compared against std::tolower/toupper references.
TEST(DEV_API, strings_cast_convert) {
  auto gpu0 = GPUPlace();
  auto cpu = CPUPlace();
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  GPUContext* dev_ctx = reinterpret_cast<GPUContext*>(pool.Get(gpu0));
  CPUContext* cpu_ctx = reinterpret_cast<CPUContext*>(pool.Get(cpu));
  // 1. create tensor
  const DDim dims({1, 2});
  StringTensorMeta meta(dims);
  // NOTE(review): `meta` is std::move'd into four Empty calls; this relies
  // on StringTensorMeta's move leaving the source usable (it just wraps a
  // DDim here) — confirm, or pass by const reference instead.
  StringTensor gpu_strings_x = phi::strings::Empty(*dev_ctx, std::move(meta));
  StringTensor cpu_strings_x = phi::strings::Empty(*cpu_ctx, std::move(meta));
  StringTensor cpu_strings_lower_out =
      phi::strings::Empty(*cpu_ctx, std::move(meta));
  StringTensor cpu_strings_upper_out =
      phi::strings::Empty(*cpu_ctx, std::move(meta));
  std::string short_str = "A Short Pstring.";
  std::string long_str = "A Large Pstring Whose Length Is Longer Than 22.";
  pstring* cpu_strings_x_data =
      cpu_ctx->template Alloc<pstring>(&cpu_strings_x);
  cpu_strings_x_data[0] = short_str;
  cpu_strings_x_data[1] = long_str;
  phi::strings::Copy(*dev_ctx, cpu_strings_x, false, &gpu_strings_x);
  // 2. get expected results, ordered {lower(short), upper(short),
  // lower(long), upper(long)}
  std::string expected_results[] = {short_str, short_str, long_str, long_str};
  std::transform(short_str.begin(),
                 short_str.end(),
                 expected_results[0].begin(),
                 ::tolower);
  std::transform(short_str.begin(),
                 short_str.end(),
                 expected_results[1].begin(),
                 ::toupper);
  std::transform(
      long_str.begin(), long_str.end(), expected_results[2].begin(), ::tolower);
  std::transform(
      long_str.begin(), long_str.end(), expected_results[3].begin(), ::toupper);
  // 3. test API, ascii encoding
  auto gpu_strings_lower_out =
      phi::strings::StringLower(*dev_ctx, gpu_strings_x, false);
  auto gpu_strings_upper_out =
      phi::strings::StringUpper(*dev_ctx, gpu_strings_x, false);
  phi::strings::Copy(
      *dev_ctx, gpu_strings_lower_out, false, &cpu_strings_lower_out);
  phi::strings::Copy(
      *dev_ctx, gpu_strings_upper_out, false, &cpu_strings_upper_out);
  // 4. check results
  ASSERT_EQ(gpu_strings_lower_out.numel(), 2);
  ASSERT_EQ(gpu_strings_upper_out.numel(), 2);
  const char* cpu_results[] = {cpu_strings_lower_out.data()[0].data(),
                               cpu_strings_upper_out.data()[0].data(),
                               cpu_strings_lower_out.data()[1].data(),
                               cpu_strings_upper_out.data()[1].data()};
  for (int i = 0; i < 4; ++i) {
    ASSERT_EQ(cpu_results[i], expected_results[i]);
  }
}
// GPU device-API check of StringLower/StringUpper with multi-byte input
// (use_utf8_encoding = true), round-tripping through device memory.
TEST(DEV_API, strings_cast_convert_utf8) {
  auto gpu0 = GPUPlace();
  auto cpu = CPUPlace();
  paddle::platform::DeviceContextPool& pool =
      paddle::platform::DeviceContextPool::Instance();
  GPUContext* dev_ctx = reinterpret_cast<GPUContext*>(pool.Get(gpu0));
  CPUContext* cpu_ctx = reinterpret_cast<CPUContext*>(pool.Get(cpu));
  // 1. create tensor
  const DDim dims({1, 1});
  StringTensorMeta meta(dims);
  // NOTE(review): `meta` is std::move'd into four Empty calls; this relies
  // on StringTensorMeta's move leaving the source usable — confirm, or
  // pass by const reference instead.
  StringTensor gpu_strings_x = phi::strings::Empty(*dev_ctx, std::move(meta));
  StringTensor cpu_strings_x = phi::strings::Empty(*cpu_ctx, std::move(meta));
  StringTensor cpu_strings_lower_out =
      phi::strings::Empty(*cpu_ctx, std::move(meta));
  StringTensor cpu_strings_upper_out =
      phi::strings::Empty(*cpu_ctx, std::move(meta));
  std::string utf8_str = "óÓsscHloëË";
  pstring* cpu_strings_x_data =
      cpu_ctx->template Alloc<pstring>(&cpu_strings_x);
  cpu_strings_x_data[0] = utf8_str;
  phi::strings::Copy(*dev_ctx, cpu_strings_x, false, &gpu_strings_x);
  // 2. get expected results (hand-written references)
  std::string expected_results[] = {"óósschloëë", "ÓÓSSCHLOËË"};
  // 3. test API, utf8 encoding
  auto gpu_strings_lower_out =
      phi::strings::StringLower(*dev_ctx, gpu_strings_x, true);
  auto gpu_strings_upper_out =
      phi::strings::StringUpper(*dev_ctx, gpu_strings_x, true);
  phi::strings::Copy(
      *dev_ctx, gpu_strings_lower_out, false, &cpu_strings_lower_out);
  phi::strings::Copy(
      *dev_ctx, gpu_strings_upper_out, false, &cpu_strings_upper_out);
  // 4. check results
  const char* cpu_results[] = {cpu_strings_lower_out.data()[0].data(),
                               cpu_strings_upper_out.data()[0].data()};
  ASSERT_EQ(cpu_strings_lower_out.numel(), 1);
  ASSERT_EQ(cpu_strings_upper_out.numel(), 1);
  for (int i = 0; i < 2; ++i) {
    ASSERT_EQ(cpu_results[i], expected_results[i]);
  }
}
} // namespace tests
} // namespace phi
- api : empty
  args : (ScalarArray shape, Place place=CPUPlace())
  output : Tensor(out@StringTensor)
  infer_meta :
    func : strings::CreateInferMeta
    param : [shape]
  kernel :
    func : strings_empty
    param : [shape]
    backend : place

- api : empty_like
  args : (Tensor x, Place place = {})
  output : Tensor(out@StringTensor)
  infer_meta :
    func : strings::CreateLikeInferMeta
    param : [x]
  kernel :
    func : strings_empty_like
    param : [x]
    backend : place > x

- api : lower
  args : (Tensor x, bool use_utf8_encoding)
  output : Tensor(out@StringTensor)
  infer_meta :
    func : strings::CreateLikeInferMeta
    param : [x]
  kernel :
    func : strings_lower

- api : upper
  args : (Tensor x, bool use_utf8_encoding)
  output : Tensor(out@StringTensor)
  infer_meta :
    func : strings::CreateLikeInferMeta
    param : [x]
  kernel :
    func : strings_upper
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import yaml
import argparse
import re
from api_gen import ForwardAPI
# Prefixes used for the local variables that the generated C++ code declares:
# converted kernel inputs (e.g. `input_x`) and their meta tensors (`meta_x`).
PREFIX_TENSOR_NAME = 'input_'
PREFIX_META_TENSOR_NAME = 'meta_'
class StringsAPI(ForwardAPI):
    """Code generator for the C++ StringTensor (pstring) API.

    Specializes ForwardAPI so that the generated functions take and produce
    phi::StringTensor where outputs are annotated with `@StringTensor`, and
    so that the kernel key is fixed to PSTRING/PSTRING_UNION.
    """

    def __init__(self, api_item_yaml):
        super(StringsAPI, self).__init__(api_item_yaml)

    def get_api_func_name(self):
        # Strings APIs use the yaml `api` name directly as the C++ name.
        return self.api

    def gene_api_declaration(self):
        # Declaration emitted into the generated strings_api.h.
        return f"""
// {", ".join(self.outputs['names'])}
PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_str['args_declare']});
"""

    def get_kernel_tensor_out_type(self, output_name):
        """Map an output name to its kernel output TensorType.

        Outputs annotated with the `@StringTensor` suffix become
        STRING_TENSOR; everything else stays DENSE_TENSOR.
        """
        strings_type = 'TensorType::DENSE_TENSOR'
        if output_name.endswith('@StringTensor'):
            strings_type = 'TensorType::STRING_TENSOR'
        return strings_type

    def get_tensor_type(self, kernel_tensor_out_type):
        """Translate a TensorType enum string into the phi C++ class name."""
        tensor_type_dict = {
            "TensorType::DENSE_TENSOR": "phi::DenseTensor",
            "TensorType::STRING_TENSOR": "phi::StringTensor",
        }
        return tensor_type_dict[kernel_tensor_out_type]

    def gene_output(self,
                    output_type_list,
                    set_out_func,
                    code_indent,
                    inplace_flag=False):
        """Generate the C++ statements that create the kernel outputs.

        Returns a tuple (kernel_output, output_names, output_create):
        `kernel_output` is the fragment of the kernel-call argument list,
        `output_names` the generated variable names, and `output_create`
        the C++ code that declares and binds those variables.
        """
        kernel_output = ""
        output_names = []
        output_create = ""

        if len(output_type_list) == 1:
            kernel_output = 'kernel_out'
            output_names.append('kernel_out')
            kernel_tensor_out_type = self.get_kernel_tensor_out_type(
                self.outputs['names'][0])
            tensor_type = self.get_tensor_type(kernel_tensor_out_type)
            inplace_assign = " = " + self.inplace_map[self.outputs['names'][
                0]] if inplace_flag and self.inplace_map is not None and self.outputs[
                    'names'][0] in self.inplace_map else ""
            output_create = f"""
{self.outputs['return_type']} api_output{inplace_assign};
{tensor_type}* kernel_out = dynamic_cast<{tensor_type}*>({set_out_func}(kernel_backend, &api_output, {kernel_tensor_out_type}));"""
        elif len(output_type_list) > 1:
            output_create = f"""
{self.outputs['return_type']} api_output;"""
            for i in range(len(output_type_list)):
                kernel_output = kernel_output + f'kernel_out_{i}, '
                output_names.append(f'kernel_out_{i}')
                kernel_tensor_out_type = self.get_kernel_tensor_out_type(
                    self.outputs['names'][i])
                tensor_type = self.get_tensor_type(kernel_tensor_out_type)
                if inplace_flag and self.inplace_map is not None and self.outputs[
                        'names'][i] in self.inplace_map:
                    output_create = output_create + f"""
std::get<{i}>(api_output) = {self.inplace_map[self.outputs['names'][i]]};"""
                # NOTE(review): unlike the single-output branch above, this
                # call does not pass `kernel_backend` to the set_out_func —
                # confirm against SetStringsKernelOutput's signature.
                output_create = output_create + f"""
{tensor_type}* kernel_out_{i} = dynamic_cast<{tensor_type}*>({set_out_func}(&std::get<{i}>(api_output), {kernel_tensor_out_type}));"""
            kernel_output = kernel_output[:-2]
        else:
            raise ValueError(
                "{} : Output error: the output should not be empty.".format(
                    self.api))

        return kernel_output, output_names, output_create

    def get_kernel_args(self, code_indent):
        """Build the kernel invocation pieces.

        Returns (input_tensor_code, kernel_args, kernel_signature): the
        input-conversion statements, the comma-joined call arguments, and
        the C++ function-pointer signature used to fetch the kernel.
        """
        input_trans_map = {
            'const Tensor&': 'const phi::StringTensor&',
            'const std::vector<Tensor>&':
            'const std::vector<const phi::StringTensor*>&',
            'const paddle::optional<Tensor>&':
            'paddle::optional<const phi::StringTensor&>',
            'const paddle::optional<std::vector<Tensor>>&':
            'paddle::optional<const std::vector<phi::StringTensor>&>'
        }
        out_trans_map = {
            'Tensor': 'phi::StringTensor*',
            'std::vector<Tensor>': 'std::vector<phi::StringTensor*>&'
        }
        input_names = self.inputs['names']
        input_infos = self.inputs['input_info']
        kernel_args_type_list = ['const platform::DeviceContext&']

        attr_names = self.attrs['names']
        kernel_param = self.kernel['param']
        if kernel_param is None:
            kernel_param = input_names + attr_names

        # set input_tensor_code: convert each API Tensor into a StringTensor.
        input_tensor_code = ""
        for i, input_name in enumerate(input_names):
            input_tensor_code = input_tensor_code + f"""
{code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = TensorToStringTensor({input_name});"""

        # set kernel_args
        kernel_args = "*dev_ctx, "
        for param in kernel_param:
            if param in input_names:
                if param in self.optional_vars:
                    kernel_args = kernel_args + PREFIX_TENSOR_NAME + param + ", "
                else:
                    if self.inputs['input_info'][param] == "const Tensor&":
                        kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", "
                    # BUG FIX: the original indexed input_info with the stale
                    # `input_name` left over from the loop above; the current
                    # kernel parameter `param` is the intended key.
                    elif self.inputs['input_info'][
                            param] == "const std::vector<Tensor>&":
                        kernel_args = kernel_args + PREFIX_TENSOR_NAME + param + ", "
                    else:
                        # do nothing
                        pass
                kernel_in_type = input_trans_map[input_infos[param]]
                kernel_args_type_list.append(kernel_in_type)
            elif param in attr_names:
                # set attr for kernel_context: wrap ScalarArray/Scalar attrs.
                if 'ScalarArray' in self.attrs['attr_info'][param][0]:
                    kernel_args_type_list.append('const phi::ScalarArray&')
                    param = 'phi::ScalarArray(' + param + ')'
                elif 'Scalar' in self.attrs['attr_info'][param][0]:
                    kernel_args_type_list.append('const phi::Scalar&')
                    param = 'phi::Scalar(' + param + ')'
                else:
                    kernel_args_type_list.append(
                        self.attrs['attr_info'][param][0])
                kernel_args = kernel_args + param + ", "
            elif isinstance(param, bool):
                kernel_args = kernel_args + str(param).lower() + ", "
            else:
                kernel_args = kernel_args + str(param) + ", "

        for out_type in self.outputs['types']:
            kernel_args_type_list.append(out_trans_map[out_type])

        # set kernel_signature
        kernel_signature = "void(*)(" + ", ".join(kernel_args_type_list) + ")"

        return input_tensor_code, kernel_args[:-2], kernel_signature

    def gen_string_tensor_kernel_code(self, inplace_flag=False,
                                      code_indent=""):
        """Emit the body of one generated API function: kernel lookup,
        device context, output creation, infer_meta and the kernel call."""
        input_tensors, kernel_args, kernel_signature = self.get_kernel_args(
            code_indent)
        outputs_args, kernel_output_names, output_create = self.gene_output(
            self.outputs['types'], 'SetStringsKernelOutput', '', inplace_flag)
        return f"""
// 1. Get kernel signature and kernel
const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}});
VLOG(6) << "{self.api} api strings kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
VLOG(6) << "{self.api} api strings kernel: " << kernel;
// 2. Get Device Context and input
auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
{input_tensors}
// 3. Set output
{output_create}
{self.gene_infer_meta(kernel_output_names, code_indent)}
// 4. run kernel
{code_indent} using kernel_signature = {kernel_signature};
{code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
{code_indent} (*kernel_fn)({kernel_args}, {outputs_args});
{code_indent} return {self.gene_return_code()};"""

    def gene_kernel_select(self) -> str:
        """Emit the C++ code that selects the kernel key.

        For strings kernels the layout/dtype are fixed to
        PSTRING_UNION/PSTRING; only the backend is resolved (from a yaml
        `backend` setting and/or the input tensors).
        """
        api = self.api
        input_names = self.inputs['names']
        attrs = self.attrs
        kernel = self.kernel

        kernel_key_item_init = """
Backend kernel_backend = Backend::UNDEFINED;
DataLayout kernel_layout = DataLayout::PSTRING_UNION;
DataType kernel_data_type = DataType::PSTRING;
"""
        # Check the tensor options
        attr_backend_count = 0
        attr_layout_count = 0
        attr_data_type_count = 0
        for attr_name in attrs['names']:
            if attrs['attr_info'][attr_name][0] == 'Backend':
                assert kernel['backend'] is not None, \
                    f"{api} api: When there is a parameter with 'Backend' type in attributes, you must set backend of kernel manually."
                attr_backend_count = attr_backend_count + 1

        # preprocess kernel configures
        kernel_select_code = ""
        if kernel['backend'] is not None:
            if '>' in kernel['backend']:
                # `a > b` means: prefer attribute `a` (a Place), fall back to
                # input `b`'s backend.
                vars_list = kernel['backend'].split('>')
                assert len(
                    vars_list
                ) == 2, f"{api} api: The number of params to set backend with '>' only allows 2, but received {len(vars_list)}."
                assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Place'), \
                    f"{api} api: When use '>' to set kernel backend, the first param should be a attribute with Place type."
                kernel_select_code = kernel_select_code + f"""
kernel_backend = ParseBackendWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
"""
            else:
                args_str = ""
                for ele in kernel['backend'].split(','):
                    args_str = args_str + ele.strip() + ', '
                kernel_select_code = kernel_select_code + f"""
kernel_backend = ParseBackend({args_str[:-2]});
"""
        kernel_select_args = ""
        for input_name in input_names:
            kernel_select_args = kernel_select_args + input_name + ", "

        if len(kernel_select_args) > 2:
            kernel_select_args = kernel_select_args[:-2]

        kernel_select_code = kernel_key_item_init + kernel_select_code

        if len(input_names) > 0:
            if self.support_selected_rows_kernel:
                kernel_select_code = kernel_select_code + f"""
KernelType kernel_type = ParseKernelTypeByInputArgs({", ".join(input_names)});
"""
            kernel_select_code = kernel_select_code + f"""
auto kernel_key_set = ParseKernelKeyByInputArgs({kernel_select_args});
auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
kernel_backend = kernel_key.backend();"""

        return kernel_select_code

    def gene_base_api_code(self, inplace_flag=False):
        """Emit the full definition of one generated API function."""
        api_func_name = self.get_api_func_name()
        return f"""
PADDLE_API {self.outputs['return_type']} {api_func_name}({self.args_str["args_define"]}) {{
{self.gene_kernel_select()}
{self.gen_string_tensor_kernel_code(inplace_flag)}
}}
"""
def header_include():
    """Return the #include block emitted at the top of the generated
    strings_api.h header."""
    header_lines = (
        '#include <tuple>',
        '#include "paddle/phi/api/include/tensor.h"',
        '#include "paddle/phi/common/scalar.h"',
        '#include "paddle/phi/common/scalar_array.h"',
        '#include "paddle/utils/optional.h"',
    )
    return '\n' + '\n'.join(header_lines) + '\n'
def source_include(header_file_path):
    """Return the #include block for the generated strings_api.cc,
    starting with the generated header itself."""
    dep_headers = (
        "paddle/phi/api/lib/api_gen_utils.h",
        "paddle/phi/core/kernel_context.h",
        "paddle/phi/core/string_tensor.h",
        "paddle/phi/infermeta/strings/nullary.h",
        "paddle/phi/infermeta/strings/unary.h",
        "paddle/phi/api/lib/api_registry.h",
        "paddle/phi/api/lib/kernel_dispatch.h",
        "paddle/phi/core/kernel_registry.h",
    )
    lines = [f'#include "{header_file_path}"']
    lines.extend(f'#include "{header}"' for header in dep_headers)
    return '\n' + '\n'.join(lines) + '\n'
def api_register():
    """Return the API-registry statement (currently commented out at the
    call site in generate_api)."""
    registry_stmt = "PD_REGISTER_API(StringsApi);"
    return "\n" + registry_stmt + "\n"
def api_namespace():
    """Return the (opening, closing) namespace text that wraps every
    generated strings API file."""
    nesting = ("paddle", "experimental", "strings")
    opening = "\n" + "".join(f"namespace {name} {{\n" for name in nesting)
    closing = "\n" + "".join(
        f"}} // namespace {name}\n" for name in reversed(nesting))
    return (opening, closing)
def generate_api(api_yaml_path, header_file_path, source_file_path):
    """Generate the C++ strings API header and source from the yaml spec.

    Args:
        api_yaml_path: path to strings_api.yaml describing the APIs.
        header_file_path: output path of the generated .h file.
        source_file_path: output path of the generated .cc file.
    """
    with open(api_yaml_path, 'r') as f:
        apis = yaml.load(f, Loader=yaml.FullLoader)

    namespace = api_namespace()
    include_header_file = "paddle/phi/api/include/strings_api.h"

    # FIX: use context managers so both output files are closed even when
    # code generation raises (the original used manual open()/close()).
    with open(header_file_path, 'w') as header_file, \
            open(source_file_path, 'w') as source_file:
        header_file.write("#pragma once\n")
        header_file.write(header_include())
        header_file.write(namespace[0])

        source_file.write(source_include(include_header_file))
        source_file.write(namespace[0])

        for api in apis:
            strings_api = StringsAPI(api)
            header_file.write(strings_api.gene_api_declaration())
            source_file.write(strings_api.gene_api_code())

        header_file.write(namespace[1])
        source_file.write(namespace[1])
        # source_file.write(api_register())
def main():
    """CLI entry point: parse the yaml/header/source paths and run the
    strings API generator."""
    parser = argparse.ArgumentParser(
        description='Generate PaddlePaddle C++ Strings API files')
    parser.add_argument(
        '--api_yaml_path',
        # FIX: the help text said "sparse api yaml file" (copy-paste from the
        # sparse generator); this is the strings generator.
        help='path to strings api yaml file',
        default='python/paddle/utils/code_gen/strings_api.yaml')
    parser.add_argument(
        '--api_header_path',
        help='output of generated api header code file',
        default='paddle/phi/api/include/strings_api.h')
    parser.add_argument(
        '--api_source_path',
        help='output of generated api source code file',
        default='paddle/phi/api/lib/strings_api.cc')
    options = parser.parse_args()

    generate_api(options.api_yaml_path, options.api_header_path,
                 options.api_source_path)


if __name__ == '__main__':
    main()
......@@ -225,6 +225,9 @@ def gen_dtype(vals: List[str]):
elif val == "complex<double>" or val == "complex128":
ir_dtypes.append("complex128")
origin_dtypes.append("paddle::experimental::complex128")
elif val == "pstring":
ir_dtypes.append("pstring")
origin_dtypes.append("paddle::experimental::pstring")
elif val == "ALL_DTYPE":
ir_dtypes.append("all")
origin_dtypes.append("all")
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册