Add copy from tensor (#34406)

* add api * temp save * revert * copytocpu async ok * fix style * copy sync ok * fix compile error * fix compile error * api done * update python async api * fix compile * remove async python api; add c++ async unittest * remove python async api * update unittest * update unittest * add C++ unittest for copytensor * add unittest * update namespace utils to class TensorUtils * add unittest * update unittest * update unittest * update code style * update code style * update unittest

Add copy from tensor (#34406)
* add api * temp save * revert * copytocpu async ok * fix style * copy sync ok * fix compile error * fix compile error * api done * update python async api * fix compile * remove async python api; add c++ async unittest * remove python async api * update unittest * update unittest * add C++ unittest for copytensor * add unittest * update namespace utils to class TensorUtils * add unittest * update unittest * update unittest * update code style * update code style * update unittest
ac33c0ca · Shang Zhizhou · GitHub · 223c01fd · ac33c0ca · ac33c0ca
11 changed file
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -20,6 +20,10 @@ if(WITH_TESTING)
    add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
+if(WITH_INFERENCE_API_TEST)
+    add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST)
+endif(WITH_INFERENCE_API_TEST)
 if(NOT WITH_PROFILER)
    add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -28,14 +28,15 @@ if(WITH_MKLDNN)
 endif()
 cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer)
+cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 if(WITH_CRYPTO)
    cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array 
-              analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
+              analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
 else()
    cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array 
-              analysis_config zero_copy_tensor trainer_desc_proto custom_operator)
+              analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator)
 endif()
 if(WIN32)

--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -121,6 +121,8 @@ DataType Tensor::type() const {
  return DataType::FLOAT32;
 }
+// Returns the place type currently stored in place_.
+PlaceType Tensor::place() const { return place_; }
 template <typename T>
 void Tensor::CopyFromCpu(const T *data) {
  EAGER_GET_TENSOR;
@@ -185,7 +187,8 @@ void Tensor::CopyFromCpu(const T *data) {
 }
 template <typename T>
-void Tensor::CopyToCpu(T *data) {
+void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
+                           void *cb_params) const {
  EAGER_GET_TENSOR;
  auto ele_num = tensor->numel();
  auto *t_data = tensor->data<T>();
@@ -222,7 +225,16 @@ void Tensor::CopyToCpu(T *data) {
 #ifdef PADDLE_WITH_HIP
    hipStreamSynchronize(dev_ctx->stream());
 #else
+    // async, return stream
+    if (nullptr != exec_stream) {
+      *(static_cast<cudaStream_t *>(exec_stream)) = dev_ctx->stream();
+      // async with callback
+    } else if (cb) {
+      cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
+      // sync
+    } else {
      cudaStreamSynchronize(dev_ctx->stream());
+    }
 #endif
 #else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
@@ -261,6 +273,22 @@ void Tensor::CopyToCpu(T *data) {
        "The analysis predictor supports CPU, GPU, NPU and XPU now."));
  }
 }
+// Blocking copy to host memory: forwards to CopyToCpuImpl with neither an
+// output stream nor a completion callback.
+template <typename T>
+void Tensor::CopyToCpu(T *data) const {
+  CopyToCpuImpl<T>(data, /*exec_stream=*/nullptr, /*cb=*/nullptr,
+                   /*cb_params=*/nullptr);
+}
+// Stream flavour of the asynchronous copy: forwards `exec_stream` to
+// CopyToCpuImpl and supplies no callback.
+template <typename T>
+void Tensor::CopyToCpuAsync(T *data, void *exec_stream) const {
+  CopyToCpuImpl<T>(data, exec_stream, /*cb=*/nullptr, /*cb_params=*/nullptr);
+}
+// Callback flavour of the asynchronous copy: forwards `cb` and `cb_params` to
+// CopyToCpuImpl and supplies no output stream.
+template <typename T>
+void Tensor::CopyToCpuAsync(T *data, CallbackFunc cb, void *cb_params) const {
+  CopyToCpuImpl<T>(data, /*exec_stream=*/nullptr, cb, cb_params);
+}
 template PD_INFER_DECL void Tensor::CopyFromCpu<float>(const float *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int64_t>(const int64_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int32_t>(const int32_t *data);
@@ -268,12 +296,38 @@ template PD_INFER_DECL void Tensor::CopyFromCpu<uint8_t>(const uint8_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data);
+template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data) const;
-template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data);
+template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data) const;
-template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data);
+template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data) const;
-template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data);
+template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data) const;
-template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data);
+template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data) const;
-template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data);
+template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
+    float *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
+    int64_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
+    int32_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
+    uint8_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
+    int8_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
+    float16 *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
+    float *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
+    int64_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
+    int32_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
+    uint8_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
+    int8_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
+    float16 *data, CallbackFunc cb, void *cb_params) const;
 template PD_INFER_DECL float *Tensor::data<float>(PlaceType *place,
                                                  int *size) const;
@@ -285,12 +339,15 @@ template PD_INFER_DECL uint8_t *Tensor::data<uint8_t>(PlaceType *place,
                                                      int *size) const;
 template PD_INFER_DECL int8_t *Tensor::data<int8_t>(PlaceType *place,
                                                    int *size) const;
+template PD_INFER_DECL float16 *Tensor::data<float16>(PlaceType *place,
+                                                      int *size) const;
 template PD_INFER_DECL float *Tensor::mutable_data<float>(PlaceType place);
 template PD_INFER_DECL int64_t *Tensor::mutable_data<int64_t>(PlaceType place);
 template PD_INFER_DECL int32_t *Tensor::mutable_data<int32_t>(PlaceType place);
 template PD_INFER_DECL uint8_t *Tensor::mutable_data<uint8_t>(PlaceType place);
 template PD_INFER_DECL int8_t *Tensor::mutable_data<int8_t>(PlaceType place);
+template PD_INFER_DECL float16 *Tensor::mutable_data<float16>(PlaceType place);
 Tensor::Tensor(void *scope) : scope_{scope} {
  PADDLE_ENFORCE_NOT_NULL(scope_,

--- a/paddle/fluid/inference/api/paddle_infer_contrib.cc
+++ b/paddle/fluid/inference/api/paddle_infer_contrib.cc
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
+namespace paddle_infer {
+namespace contrib {
+using paddle::PaddleDType;
+// Allocates `size` bytes of host memory with cudaMallocHost and returns the
+// pointer. When Paddle is built without CUDA nothing is allocated and nullptr
+// is returned, so callers must be prepared for a null result.
+void* TensorUtils::CudaMallocPinnedMemory(size_t size) {
+  void* pinned = nullptr;
+#if defined(PADDLE_WITH_CUDA)
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMallocHost(&pinned, size));
+#endif
+  return pinned;
+}
+// Releases a buffer with cudaFreeHost (the counterpart of
+// CudaMallocPinnedMemory). Does nothing when Paddle is built without CUDA.
+void TensorUtils::CudaFreePinnedMemory(void* ptr) {
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(ptr));
+#endif
+}
+// Copies the contents of `src` into `*p_dst`. The destination is reshaped to
+// the source shape first; both tensors must live on kCPU or kGPU.
+//
+// \param p_dst       Destination tensor, must not be null.
+// \param src         Source tensor.
+// \param exec_stream Optional; when non-null it is treated as a
+//                    cudaStream_t* and receives the stream the copy was
+//                    enqueued on. The call then returns without waiting.
+// \param cb          Optional; when `exec_stream` is null and `cb` is set,
+//                    cb(cb_params) is enqueued on the copy stream as a host
+//                    function.
+// \param cb_params   Opaque argument handed to `cb`.
+//
+// With neither `exec_stream` nor `cb`, the GPU-destination path synchronizes
+// the stream before returning. For a CPU destination the three arguments are
+// forwarded unchanged to Tensor::CopyToCpuImpl.
+void TensorUtils::CopyTensorImpl(Tensor* p_dst, const Tensor& src,
+                                 void* exec_stream, CallbackFunc cb,
+                                 void* cb_params) {
+  PADDLE_ENFORCE_NOT_NULL(
+      p_dst, paddle::platform::errors::InvalidArgument(
+                 "The destination tensor of CopyTensor should not be null."));
+  Tensor& dst = *p_dst;
+  dst.Reshape(src.shape());
+  PADDLE_ENFORCE(
+      src.place() == PlaceType::kCPU || src.place() == PlaceType::kGPU,
+      paddle::platform::errors::InvalidArgument(
+          "CopyTensor only support PlaceType kCPU/kGPU now."));
+  PADDLE_ENFORCE(
+      dst.place() == PlaceType::kCPU || dst.place() == PlaceType::kGPU,
+      paddle::platform::errors::InvalidArgument(
+          "CopyTensor only support PlaceType kCPU/kGPU now."));
+  // copy to cpu, gpu => cpu or cpu => cpu
+  if (dst.place() == PlaceType::kCPU) {
+    switch (src.type()) {
+      case PaddleDType::INT32:
+        src.CopyToCpuImpl(dst.mutable_data<int32_t>(PlaceType::kCPU),
+                          exec_stream, cb, cb_params);
+        break;
+      case PaddleDType::INT64:
+        src.CopyToCpuImpl(dst.mutable_data<int64_t>(PlaceType::kCPU),
+                          exec_stream, cb, cb_params);
+        break;
+      case PaddleDType::FLOAT32:
+        src.CopyToCpuImpl(dst.mutable_data<float>(PlaceType::kCPU), exec_stream,
+                          cb, cb_params);
+        break;
+      case PaddleDType::UINT8:
+        src.CopyToCpuImpl(dst.mutable_data<uint8_t>(PlaceType::kCPU),
+                          exec_stream, cb, cb_params);
+        break;
+      case PaddleDType::INT8:
+        src.CopyToCpuImpl(dst.mutable_data<int8_t>(PlaceType::kCPU),
+                          exec_stream, cb, cb_params);
+        break;
+      case PaddleDType::FLOAT16:
+        src.CopyToCpuImpl(
+            dst.mutable_data<paddle::platform::float16>(PlaceType::kCPU),
+            exec_stream, cb, cb_params);
+        break;
+      default:
+        PADDLE_THROW(paddle::platform::errors::Unimplemented(
+            "Only INT32, INT64, UINT8, INT8, FLOAT16 and "
+            "FLOAT32 is supported in Tensor. Others not implements"));
+    }
+    // gpu => gpu or cpu => gpu
+  } else {
+#if defined(PADDLE_WITH_CUDA)
+    void* dst_data = nullptr;
+    void* src_data = nullptr;
+    size_t data_len = 0;  // number of bytes to transfer
+    int data_size = 0;    // element count reported by Tensor::data()
+    PlaceType src_place;  // out-parameter of Tensor::data(); not read here
+    switch (src.type()) {
+      case PaddleDType::INT32:
+        dst_data =
+            static_cast<void*>(dst.mutable_data<int32_t>(PlaceType::kGPU));
+        src_data =
+            static_cast<void*>(src.data<int32_t>(&src_place, &data_size));
+        data_len = data_size * sizeof(int32_t);
+        break;
+      case PaddleDType::INT64:
+        dst_data =
+            static_cast<void*>(dst.mutable_data<int64_t>(PlaceType::kGPU));
+        src_data =
+            static_cast<void*>(src.data<int64_t>(&src_place, &data_size));
+        data_len = data_size * sizeof(int64_t);
+        break;
+      case PaddleDType::FLOAT32:
+        dst_data = static_cast<void*>(dst.mutable_data<float>(PlaceType::kGPU));
+        src_data = static_cast<void*>(src.data<float>(&src_place, &data_size));
+        data_len = data_size * sizeof(float);
+        break;
+      case PaddleDType::UINT8:
+        dst_data =
+            static_cast<void*>(dst.mutable_data<uint8_t>(PlaceType::kGPU));
+        src_data =
+            static_cast<void*>(src.data<uint8_t>(&src_place, &data_size));
+        data_len = data_size * sizeof(uint8_t);
+        break;
+      case PaddleDType::INT8:
+        dst_data =
+            static_cast<void*>(dst.mutable_data<int8_t>(PlaceType::kGPU));
+        src_data = static_cast<void*>(src.data<int8_t>(&src_place, &data_size));
+        data_len = data_size * sizeof(int8_t);
+        break;
+      case PaddleDType::FLOAT16:
+        dst_data = static_cast<void*>(
+            dst.mutable_data<paddle::platform::float16>(PlaceType::kGPU));
+        src_data = static_cast<void*>(
+            src.data<paddle::platform::float16>(&src_place, &data_size));
+        // Derive the element size from the type, like the other cases do.
+        data_len = data_size * sizeof(paddle::platform::float16);
+        break;
+      default:
+        PADDLE_THROW(paddle::platform::errors::Unimplemented(
+            "Only INT32, INT64, UINT8, INT8, FLOAT16 and "
+            "FLOAT32 is supported in Tensor. Others not implements"));
+    }
+    paddle::platform::DeviceContextPool& pool =
+        paddle::platform::DeviceContextPool::Instance();
+    paddle::platform::CUDAPlace gpu_place(dst.device_);
+    auto* dev_ctx = static_cast<const paddle::platform::CUDADeviceContext*>(
+        pool.Get(gpu_place));
+    if (src.place() == PlaceType::kCPU) {
+      paddle::memory::Copy(gpu_place, dst_data, paddle::platform::CPUPlace(),
+                           src_data, data_len, dev_ctx->stream());
+    } else {
+      // Describe the source with the device recorded on `src` instead of a
+      // default-constructed CUDAPlace, so the copy is not tied to device 0.
+      paddle::memory::Copy(gpu_place, dst_data,
+                           paddle::platform::CUDAPlace(src.device_), src_data,
+                           data_len, dev_ctx->stream());
+    }
+    if (nullptr != exec_stream) {
+      // async, hand the stream back to the caller
+      *(static_cast<cudaStream_t*>(exec_stream)) = dev_ctx->stream();
+    } else if (cb) {
+      // async with callback
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params));
+    } else {
+      // sync
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream()));
+    }
+#else
+    PADDLE_THROW(paddle::platform::errors::Unavailable(
+        "Can not copy tensor to GPU CUDA place because paddle is not compiled "
+        "with CUDA."));
+#endif
+  }
+}
+// Blocking tensor copy: calls CopyTensorImpl without an output stream or a
+// completion callback.
+void TensorUtils::CopyTensor(Tensor* p_dst, const Tensor& src) {
+  CopyTensorImpl(p_dst, src, /*exec_stream=*/nullptr, /*cb=*/nullptr,
+                 /*cb_params=*/nullptr);
+}
+// Asynchronous tensor copy, stream flavour: passes `exec_stream` through to
+// CopyTensorImpl and supplies no callback.
+void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
+                                  void* exec_stream) {
+  CopyTensorImpl(p_dst, src, exec_stream, /*cb=*/nullptr,
+                 /*cb_params=*/nullptr);
+}
+// Asynchronous tensor copy, callback flavour: passes `cb` and `cb_params`
+// through to CopyTensorImpl and supplies no output stream.
+void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
+                                  CallbackFunc cb, void* cb_params) {
+  CopyTensorImpl(p_dst, src, /*exec_stream=*/nullptr, cb, cb_params);
+}
+}  // namespace contrib
+}  // namespace paddle_infer
--- a/paddle/fluid/inference/api/paddle_infer_contrib.h
+++ b/paddle/fluid/inference/api/paddle_infer_contrib.h
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+namespace paddle_infer {
+namespace contrib {
+/// \brief Static helpers for pinned host memory and tensor-to-tensor copies.
+class TensorUtils {
+ public:
+  /// \brief Allocate `size` bytes of host memory through cudaMallocHost.
+  /// \return The allocated pointer, or nullptr when Paddle is built without
+  /// CUDA.
+  static void* CudaMallocPinnedMemory(size_t size);
+  /// \brief Release memory through cudaFreeHost (no-op without CUDA).
+  /// \param[in] mem Pointer to release.
+  static void CudaFreePinnedMemory(void* mem);
+  /// \brief Copy `src` into `*p_dst`; the destination is reshaped to the
+  /// shape of `src`. Only kCPU and kGPU tensors are accepted.
+  /// \param[out] p_dst Destination tensor.
+  /// \param[in] src Source tensor.
+  static void CopyTensor(Tensor* p_dst, const Tensor& src);
+  /// \brief Same copy as CopyTensor, but `exec_stream` is forwarded to the
+  /// implementation so the caller can synchronize on it.
+  /// \param[out] exec_stream Pointer to a cudaStream_t that receives the
+  /// stream used for a GPU-destination copy.
+  static void CopyTensorAsync(Tensor* p_dst, const Tensor& src,
+                              void* exec_stream);
+  /// \brief Same copy as CopyTensor, with a completion callback.
+  /// \param[in] cb For a GPU destination, cb(cb_params) is enqueued on the
+  /// copy stream as a host function.
+  /// \param[in] cb_params Opaque argument passed to `cb`.
+  static void CopyTensorAsync(Tensor* p_dst, const Tensor& src, CallbackFunc cb,
+                              void* cb_params);
+ private:
+  // Shared implementation behind the three public copy entry points.
+  static void CopyTensorImpl(Tensor* p_dst, const Tensor& src,
+                             void* exec_stream, CallbackFunc cb,
+                             void* cb_params);
+};
+}  // namespace contrib
+}  // namespace paddle_infer
--- a/paddle/fluid/inference/api/paddle_tensor.h
+++ b/paddle/fluid/inference/api/paddle_tensor.h
@@ -18,6 +18,16 @@
 namespace paddle_infer {
+typedef void (*CallbackFunc)(void*);
+#if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
+class InferApiTesterUtils;
+#endif
+namespace contrib {
+class TensorUtils;
+}
 /// \brief Paddle data type.
 enum DataType {
  FLOAT32,
@@ -72,7 +82,21 @@ class PD_INFER_DECL Tensor {
  /// It's usually used to get the output tensor data.
  /// \param[out] data The tensor will copy the data to the address.
  template <typename T>
-  void CopyToCpu(T* data);
+  void CopyToCpu(T* data) const;
+  /// \brief Copy the tensor data to the host memory asynchronously.
+  /// \param[out] data The tensor will copy the data to the address.
+  /// \param[out] exec_stream The tensor will execute the copy in this stream
+  /// (only GPU CUDA streams are supported now).
+  template <typename T>
+  void CopyToCpuAsync(T* data, void* exec_stream) const;
+  /// \brief Copy the tensor data to the host memory asynchronously.
+  /// \param[out] data The tensor will copy the data to the address.
+  /// \param[in] cb Callback function cb(cb_params) will be executed on the
+  /// host after all currently enqueued items in the stream have completed.
+  /// \param[in] cb_params The argument passed to the callback function cb.
+  template <typename T>
+  void CopyToCpuAsync(T* data, CallbackFunc cb, void* cb_params) const;
  /// \brief Return the shape of the Tensor.
  std::vector<int> shape() const;
@@ -92,12 +116,20 @@ class PD_INFER_DECL Tensor {
  /// \return The data type of the tensor.
  DataType type() const;
+  /// \brief Return the place type of the tensor.
+  /// \return The place type of the tensor.
+  PlaceType place() const;
 protected:
  explicit Tensor(void* scope);
  void* FindTensor() const;
  void SetPlace(PlaceType place, int device = -1);
  void SetName(const std::string& name);
+  template <typename T>
+  void CopyToCpuImpl(T* data, void* stream = nullptr, CallbackFunc cb = nullptr,
+                     void* cb_params = nullptr) const;
  std::string name_;
  // The corresponding tensor pointer inside Paddle workspace is cached for
  // performance.
@@ -107,6 +139,11 @@ class PD_INFER_DECL Tensor {
  void* scope_{nullptr};
  PlaceType place_;
  int device_;
+  friend class paddle_infer::contrib::TensorUtils;
+#if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
+  friend class paddle_infer::InferApiTesterUtils;
+#endif
 };
 }  // namespace paddle_infer
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -682,6 +682,11 @@ if(WITH_GPU)
    inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc
        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
        ARGS --infer_model=${RESNET50_MODEL_DIR})
+    inference_analysis_test(paddle_infer_api_copy_tensor_tester SRCS paddle_infer_api_copy_tensor_tester.cc
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${RESNET50_MODEL_DIR})
+    set_tests_properties(paddle_infer_api_copy_tensor_tester PROPERTIES TIMEOUT 30)
 endif()
 if("$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")

--- a/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc
+++ b/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+#include <cstring>
+#include <numeric>
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
+#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
+#include "paddle/fluid/platform/float16.h"
+namespace paddle_infer {
+// Test-only helper that builds a standalone inference Tensor. It touches
+// Tensor members such as input_or_output_, SetName and SetPlace directly.
+class InferApiTesterUtils {
+ public:
+  // Creates the variable `name` in the scope behind `p_scope`, materializes a
+  // LoDTensor in it, and returns a Tensor handle bound to that name, placed on
+  // `place` with device id 0.
+  static std::unique_ptr<Tensor> CreateInferTensorForTest(
+      const std::string &name, PlaceType place, void *p_scope) {
+    auto *scope = static_cast<paddle::framework::Scope *>(p_scope);
+    scope->Var(name)->GetMutable<paddle::framework::LoDTensor>();
+    std::unique_ptr<Tensor> tensor(new Tensor(p_scope));
+    tensor->input_or_output_ = true;
+    tensor->SetName(name);
+    tensor->SetPlace(place, 0 /*device id*/);
+    return tensor;
+  }
+};
+// Runs the model given by --infer_model on the GPU, copies the first output
+// into pinned host memory with the stream-returning CopyToCpuAsync overload,
+// waits on that stream and compares the first ten values with reference
+// outputs.
+TEST(Tensor, copy_to_cpu_async_stream) {
+  LOG(INFO) << GetVersion();
+  UpdateDllFlag("conv_workspace_size_limit", "4000");
+  std::string model_dir = FLAGS_infer_model + "/model";
+  Config config;
+  config.SetModel(model_dir + "/model", model_dir + "/params");
+  config.EnableUseGpu(100, 0);
+  auto predictor = CreatePredictor(config);
+  auto pred_clone = predictor->Clone();
+  std::vector<int> in_shape = {1, 3, 318, 318};
+  // std::multiplies matches the output-size computation below; the previous
+  // lambda took `int &` parameters, which cannot bind to the moved
+  // accumulator that std::accumulate passes since C++20.
+  int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
+                               std::multiplies<int>());
+  std::vector<float> input(in_num, 1.0);
+  const auto &input_names = predictor->GetInputNames();
+  auto input_tensor = predictor->GetInputHandle(input_names[0]);
+  input_tensor->Reshape(in_shape);
+  input_tensor->CopyFromCpu(input.data());
+  predictor->Run();
+  const auto &output_names = predictor->GetOutputNames();
+  auto output_tensor = predictor->GetOutputHandle(output_names[0]);
+  std::vector<int> output_shape = output_tensor->shape();
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+  std::vector<float> correct_out_data = {
+      127.78,   1.07353,  -229.42, 1127.28, -177.365,
+      -292.412, -271.614, 466.054, 540.436, -214.223,
+  };
+  // The comparison loop reads ten elements; fail early if there are fewer.
+  ASSERT_GE(out_num, 10);
+  float *out_data = static_cast<float *>(
+      contrib::TensorUtils::CudaMallocPinnedMemory(sizeof(float) * out_num));
+  // CudaMallocPinnedMemory returns nullptr in builds without CUDA.
+  ASSERT_TRUE(out_data != nullptr);
+  memset(out_data, 0, sizeof(float) * out_num);
+  for (int i = 0; i < 100; i++) {
+    predictor->Run();
+  }
+  cudaStream_t stream = nullptr;
+  output_tensor->CopyToCpuAsync(out_data, static_cast<void *>(&stream));
+  // sync
+  EXPECT_EQ(cudaStreamSynchronize(stream), cudaSuccess);
+  for (int i = 0; i < 10; i++) {
+    EXPECT_NEAR(out_data[i] / correct_out_data[i], 1.0, 1e-3);
+  }
+  contrib::TensorUtils::CudaFreePinnedMemory(static_cast<void *>(out_data));
+}
+// Runs the model given by --infer_model on the GPU and copies the first
+// output into pinned host memory with the callback CopyToCpuAsync overload.
+// The callback compares the first ten values with reference outputs;
+// cudaDeviceSynchronize() then waits before the buffer is freed.
+TEST(Tensor, copy_to_cpu_async_callback) {
+  LOG(INFO) << GetVersion();
+  UpdateDllFlag("conv_workspace_size_limit", "4000");
+  std::string model_dir = FLAGS_infer_model + "/model";
+  Config config;
+  config.SetModel(model_dir + "/model", model_dir + "/params");
+  config.EnableUseGpu(100, 0);
+  auto predictor = CreatePredictor(config);
+  auto pred_clone = predictor->Clone();
+  std::vector<int> in_shape = {1, 3, 318, 318};
+  // std::multiplies matches the output-size computation below; the previous
+  // lambda took `int &` parameters, which cannot bind to the moved
+  // accumulator that std::accumulate passes since C++20.
+  int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
+                               std::multiplies<int>());
+  std::vector<float> input(in_num, 1.0);
+  const auto &input_names = predictor->GetInputNames();
+  auto input_tensor = predictor->GetInputHandle(input_names[0]);
+  input_tensor->Reshape(in_shape);
+  input_tensor->CopyFromCpu(input.data());
+  predictor->Run();
+  const auto &output_names = predictor->GetOutputNames();
+  auto output_tensor = predictor->GetOutputHandle(output_names[0]);
+  std::vector<int> output_shape = output_tensor->shape();
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+  // The callback reads ten elements; fail early if there are fewer.
+  ASSERT_GE(out_num, 10);
+  float *out_data = static_cast<float *>(
+      contrib::TensorUtils::CudaMallocPinnedMemory(sizeof(float) * out_num));
+  // CudaMallocPinnedMemory returns nullptr in builds without CUDA.
+  ASSERT_TRUE(out_data != nullptr);
+  memset(out_data, 0, sizeof(float) * out_num);
+  for (int i = 0; i < 100; i++) {
+    predictor->Run();
+  }
+  output_tensor->CopyToCpuAsync(
+      out_data,
+      [](void *cb_params) {
+        float *data = static_cast<float *>(cb_params);
+        std::vector<float> correct_out_data = {
+            127.78,   1.07353,  -229.42, 1127.28, -177.365,
+            -292.412, -271.614, 466.054, 540.436, -214.223,
+        };
+        for (int i = 0; i < 10; i++) {
+          EXPECT_NEAR(data[i] / correct_out_data[i], 1.0, 1e-3);
+        }
+      },
+      static_cast<void *>(out_data));
+  // Wait so the callback has run before its buffer is released.
+  EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess);
+  contrib::TensorUtils::CudaFreePinnedMemory(static_cast<void *>(out_data));
+}
+// Shared body of the typed CopyTensor tests. Fills a 2x3 source tensor with
+// ones and a 2x2 destination tensor with twos, copies source to destination
+// with TensorUtils::CopyTensor, then checks that the destination took the
+// source's shape and contents.
+template <class DTYPE>
+static void test_copy_tensor(PlaceType src_place, PlaceType dst_place) {
+  paddle::framework::Scope scope;
+  void *scope_ptr = static_cast<void *>(&scope);
+  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_src", src_place, scope_ptr);
+  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_dst", dst_place, scope_ptr);
+
+  std::vector<DTYPE> data_src(6, 1);
+  tensor_src->Reshape({2, 3});
+  tensor_src->CopyFromCpu(data_src.data());
+
+  std::vector<DTYPE> data_dst(4, 2);
+  tensor_dst->Reshape({2, 2});
+  tensor_dst->CopyFromCpu(data_dst.data());
+
+  paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);
+
+  EXPECT_EQ(tensor_dst->shape().size(), static_cast<size_t>(2));
+  EXPECT_EQ(tensor_dst->shape()[0], 2);
+  EXPECT_EQ(tensor_dst->shape()[1], 3);
+
+  // Pre-fill with 3 so stale values cannot be mistaken for copied ones.
+  std::vector<DTYPE> data_check(6, 3);
+  tensor_dst->CopyToCpu<DTYPE>(static_cast<DTYPE *>(data_check.data()));
+  for (const DTYPE &value : data_check) {
+    EXPECT_NEAR(value, 1, 1e-5);
+  }
+}
+// Typed CopyTensor cases. test_copy_tensor takes (src_place, dst_place).
+// float32 covers CPU->CPU, CPU->GPU and GPU->GPU.
+TEST(CopyTensor, float32) {
+  test_copy_tensor<float>(PlaceType::kCPU, PlaceType::kCPU);
+  test_copy_tensor<float>(PlaceType::kCPU, PlaceType::kGPU);
+  test_copy_tensor<float>(PlaceType::kGPU, PlaceType::kGPU);
+}
+// The integer cases cover CPU->CPU and GPU->GPU only.
+// NOTE(review): no case in this group uses a GPU source with a CPU
+// destination.
+TEST(CopyTensor, int32) {
+  test_copy_tensor<int32_t>(PlaceType::kCPU, PlaceType::kCPU);
+  test_copy_tensor<int32_t>(PlaceType::kGPU, PlaceType::kGPU);
+}
+TEST(CopyTensor, int64) {
+  test_copy_tensor<int64_t>(PlaceType::kCPU, PlaceType::kCPU);
+  test_copy_tensor<int64_t>(PlaceType::kGPU, PlaceType::kGPU);
+}
+TEST(CopyTensor, int8) {
+  test_copy_tensor<int8_t>(PlaceType::kCPU, PlaceType::kCPU);
+  test_copy_tensor<int8_t>(PlaceType::kGPU, PlaceType::kGPU);
+}
+TEST(CopyTensor, uint8) {
+  test_copy_tensor<uint8_t>(PlaceType::kCPU, PlaceType::kCPU);
+  test_copy_tensor<uint8_t>(PlaceType::kGPU, PlaceType::kGPU);
+}
+// CPU -> CPU CopyTensor for float16: a 2x3 source of ones is copied over a
+// 2x2 destination of twos; the destination must end up 2x3 and all ones.
+TEST(CopyTensor, float16) {
+  paddle::framework::Scope scope;
+  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_src", PlaceType::kCPU, static_cast<void *>(&scope));
+  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_dst", PlaceType::kCPU, static_cast<void *>(&scope));
+  using paddle::platform::float16;
+  std::vector<float16> data_src(6, float16(1.0));
+  tensor_src->Reshape({2, 3});
+  tensor_src->CopyFromCpu(data_src.data());
+  std::vector<float16> data_dst(4, float16(2.0));
+  tensor_dst->Reshape({2, 2});
+  tensor_dst->CopyFromCpu(data_dst.data());
+  paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);
+  EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
+  EXPECT_EQ(tensor_dst->shape()[0], 2);
+  EXPECT_EQ(tensor_dst->shape()[1], 3);
+  // Pre-fill with a value other than the expected 1.0. A buffer that already
+  // held 1.0 would pass the check below even if nothing had been copied.
+  std::vector<float16> data_check(6, float16(3.0));
+  tensor_dst->CopyToCpu<float16>(data_check.data());
+  for (int i = 0; i < 6; i++) {
+    EXPECT_TRUE(data_check[i] == float16(1.0));
+  }
+}
+// GPU -> GPU CopyTensor for float16: a 2x3 source of ones is copied over a
+// 2x2 destination of twos; the destination must end up 2x3 and all ones.
+TEST(CopyTensor, float16_gpu) {
+  paddle::framework::Scope scope;
+  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_src", PlaceType::kGPU, static_cast<void *>(&scope));
+  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_dst", PlaceType::kGPU, static_cast<void *>(&scope));
+  using paddle::platform::float16;
+  std::vector<float16> data_src(6, float16(1.0));
+  tensor_src->Reshape({2, 3});
+  tensor_src->CopyFromCpu(data_src.data());
+  std::vector<float16> data_dst(4, float16(2.0));
+  tensor_dst->Reshape({2, 2});
+  tensor_dst->CopyFromCpu(data_dst.data());
+  paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);
+  EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
+  EXPECT_EQ(tensor_dst->shape()[0], 2);
+  EXPECT_EQ(tensor_dst->shape()[1], 3);
+  // Pre-fill with a value other than the expected 1.0. A buffer that already
+  // held 1.0 would pass the check below even if nothing had been copied.
+  std::vector<float16> data_check(6, float16(3.0));
+  tensor_dst->CopyToCpu<float16>(data_check.data());
+  for (int i = 0; i < 6; i++) {
+    EXPECT_TRUE(data_check[i] == float16(1.0));
+  }
+}
+// GPU -> GPU CopyTensorAsync, stream flavour: the copy stream is returned
+// through `stream`, the test synchronizes on it, then reads the destination
+// back and expects a 2x3 tensor of ones.
+TEST(CopyTensor, async_stream) {
+  paddle::framework::Scope scope;
+  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_src", PlaceType::kGPU, static_cast<void *>(&scope));
+  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_dst", PlaceType::kGPU, static_cast<void *>(&scope));
+  std::vector<float> data_src(6, 1.0);
+  tensor_src->Reshape({2, 3});
+  tensor_src->CopyFromCpu(data_src.data());
+  std::vector<float> data_dst(4, 2.0);
+  tensor_dst->Reshape({2, 2});
+  tensor_dst->CopyFromCpu(data_dst.data());
+  cudaStream_t stream = nullptr;
+  paddle_infer::contrib::TensorUtils::CopyTensorAsync(
+      tensor_dst.get(), *tensor_src, static_cast<void *>(&stream));
+  EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
+  EXPECT_EQ(tensor_dst->shape()[0], 2);
+  EXPECT_EQ(tensor_dst->shape()[1], 3);
+  EXPECT_EQ(cudaStreamSynchronize(stream), cudaSuccess);
+  // Pre-fill with a value other than the expected 1.0. A buffer that already
+  // held 1.0 would pass the check below even if nothing had been copied.
+  std::vector<float> data_check(6, 3.0);
+  tensor_dst->CopyToCpu<float>(data_check.data());
+  for (int i = 0; i < 6; i++) {
+    EXPECT_NEAR(data_check[i], static_cast<float>(1.0), 1e-5);
+  }
+}
+// CPU -> GPU CopyTensorAsync, callback flavour: the callback receives the
+// destination tensor and checks that it took the 2x3 source shape.
+// cudaDeviceSynchronize() then waits before the tensors go out of scope.
+TEST(CopyTensor, async_callback) {
+  paddle::framework::Scope scope;
+  void *scope_ptr = static_cast<void *>(&scope);
+  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_src", PlaceType::kCPU, scope_ptr);
+  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_dst", PlaceType::kGPU, scope_ptr);
+
+  std::vector<float> data_src(6, 1.0);
+  tensor_src->Reshape({2, 3});
+  tensor_src->CopyFromCpu(data_src.data());
+
+  std::vector<float> data_dst(4, 2.0);
+  tensor_dst->Reshape({2, 2});
+  tensor_dst->CopyFromCpu(data_dst.data());
+
+  paddle_infer::contrib::TensorUtils::CopyTensorAsync(
+      tensor_dst.get(), *tensor_src,
+      [](void *cb_params) {
+        Tensor *copied = static_cast<Tensor *>(cb_params);
+        EXPECT_EQ(copied->shape().size(), static_cast<size_t>(2));
+        EXPECT_EQ(copied->shape()[0], 2);
+        EXPECT_EQ(copied->shape()[1], 3);
+      },
+      static_cast<void *>(tensor_dst.get()));
+  cudaDeviceSynchronize();
+}
+}  // namespace paddle_infer
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -28,6 +28,7 @@
 #include <vector>
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
 #include "paddle/fluid/inference/utils/io_utils.h"
@@ -286,6 +287,12 @@ py::bytes SerializePDTensorToBytes(PaddleTensor &tensor) {  // NOLINT
  paddle::inference::SerializePDTensorToStream(&ss, tensor);
  return static_cast<py::bytes>(ss.str());
 }
+// Adapter bound to Python as `copy_tensor`: takes the destination by
+// reference and forwards its address to TensorUtils::CopyTensor.
+void CopyPaddleInferTensor(paddle_infer::Tensor &dst,  // NOLINT
+                           const paddle_infer::Tensor &src) {
+  paddle_infer::contrib::TensorUtils::CopyTensor(&dst, src);
+}
 }  // namespace
 void BindInferenceApi(py::module *m) {
@@ -317,6 +324,7 @@ void BindInferenceApi(py::module *m) {
                                           new paddle_infer::Predictor(config));
                                   return std::move(pred);
                                 });
+  m->def("copy_tensor", &CopyPaddleInferTensor);
  m->def("paddle_dtype_size", &paddle::PaddleDtypeSize);
  m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes);
  m->def("get_version", &paddle_infer::GetVersion);

--- a/python/paddle/inference/contrib/__init__.py
+++ b/python/paddle/inference/contrib/__init__.py
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/python/paddle/inference/contrib/utils/__init__.py
+++ b/python/paddle/inference/contrib/utils/__init__.py
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ....fluid.core import copy_tensor  # noqa: F401