机器未来 / Paddle (forked from PaddlePaddle / Paddle)

Commit 2fa3ce2b
Authored on Aug 26, 2021 by zhangchunle; committed via GitHub on Aug 26, 2021.
Revert "Add copy from tensor (#34406)"
This reverts commit
ac33c0ca
.
上级
fa6c59a4
Changes: 11 files changed, 11 additions(+) and 710 deletions(-). A short sketch of the removed API in use follows the file list.
cmake/configure.cmake                                                     +0   -4
paddle/fluid/inference/api/CMakeLists.txt                                 +2   -3
paddle/fluid/inference/api/details/zero_copy_tensor.cc                    +8   -65
paddle/fluid/inference/api/paddle_infer_contrib.cc                        +0   -190
paddle/fluid/inference/api/paddle_infer_contrib.h                         +0   -40
paddle/fluid/inference/api/paddle_tensor.h                                +1   -38
paddle/fluid/inference/tests/api/CMakeLists.txt                           +0   -5
paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc   +0   -329
paddle/fluid/pybind/inference_api.cc                                      +0   -8
python/paddle/inference/contrib/__init__.py                               +0   -13
python/paddle/inference/contrib/utils/__init__.py                         +0   -15
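For context, the reverted change had exposed a contrib tensor-copy utility on top of the inference API. Below is a minimal sketch of how the removed API was driven, distilled from the deleted header and test file reproduced further down; the function name and tensor handles are illustrative placeholders, not part of the original code:

#include <cuda_runtime.h>
#include "paddle/fluid/inference/api/paddle_infer_contrib.h"

// Sketch only: TensorUtils is removed by this revert; the calls below follow
// the declarations in the deleted paddle_infer_contrib.h reproduced below.
void CopyTensorSketch(paddle_infer::Tensor *dst,
                      const paddle_infer::Tensor &src) {
  using paddle_infer::contrib::TensorUtils;

  // Synchronous copy: blocks until dst holds src's shape and data.
  TensorUtils::CopyTensor(dst, src);

  // Asynchronous copy: the CUDA stream the copy was enqueued on is written
  // through the exec_stream argument, so the caller can synchronize later.
  cudaStream_t stream;
  TensorUtils::CopyTensorAsync(dst, src, static_cast<void *>(&stream));
  cudaStreamSynchronize(stream);
}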
cmake/configure.cmake

@@ -20,10 +20,6 @@ if(WITH_TESTING)
   add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
 
-if(WITH_INFERENCE_API_TEST)
-  add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST)
-endif(WITH_INFERENCE_API_TEST)
-
 if(NOT WITH_PROFILER)
   add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
paddle/fluid/inference/api/CMakeLists.txt

@@ -28,15 +28,14 @@ if(WITH_MKLDNN)
 endif()
 
 cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer)
-cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 
 if(WITH_CRYPTO)
   cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
-      analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
+      analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
 else()
   cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
-      analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator)
+      analysis_config zero_copy_tensor trainer_desc_proto custom_operator)
 endif()
 
 if(WIN32)
paddle/fluid/inference/api/details/zero_copy_tensor.cc

@@ -121,8 +121,6 @@ DataType Tensor::type() const {
   return DataType::FLOAT32;
 }
 
-PlaceType Tensor::place() const { return place_; }
-
 template <typename T>
 void Tensor::CopyFromCpu(const T *data) {
   EAGER_GET_TENSOR;

@@ -187,8 +185,7 @@ void Tensor::CopyFromCpu(const T *data) {
 }
 
 template <typename T>
-void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
-                           void *cb_params) const {
+void Tensor::CopyToCpu(T *data) {
   EAGER_GET_TENSOR;
   auto ele_num = tensor->numel();
   auto *t_data = tensor->data<T>();

@@ -225,16 +222,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
 #ifdef PADDLE_WITH_HIP
     hipStreamSynchronize(dev_ctx->stream());
 #else
-    // async, return stream
-    if (nullptr != exec_stream) {
-      *(static_cast<cudaStream_t *>(exec_stream)) = dev_ctx->stream();
-      // async with callback
-    } else if (cb) {
-      cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
-      // sync
-    } else {
-      cudaStreamSynchronize(dev_ctx->stream());
-    }
+    cudaStreamSynchronize(dev_ctx->stream());
 #endif
 #else
   PADDLE_THROW(paddle::platform::errors::Unavailable(

@@ -273,22 +261,6 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
       "The analysis predictor supports CPU, GPU, NPU and XPU now."));
 }
 }
 
-template <typename T>
-void Tensor::CopyToCpu(T *data) const {
-  CopyToCpuImpl<T>(data, nullptr, nullptr, nullptr);
-}
-
-template <typename T>
-void Tensor::CopyToCpuAsync(T *data, void *exec_stream) const {
-  CopyToCpuImpl<T>(data, exec_stream, nullptr, nullptr);
-}
-
-template <typename T>
-void Tensor::CopyToCpuAsync(T *data, CallbackFunc cb, void *cb_params) const {
-  CopyToCpuImpl<T>(data, nullptr, cb, cb_params);
-}
-
 template PD_INFER_DECL void Tensor::CopyFromCpu<float>(const float *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int64_t>(const int64_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int32_t>(const int32_t *data);

@@ -296,38 +268,12 @@ template PD_INFER_DECL void Tensor::CopyFromCpu<uint8_t>(const uint8_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);
 
-template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data) const;
-template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data) const;
-template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data) const;
-template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data) const;
-template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data) const;
-template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data) const;
-
-template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
-    float *data, void *exec_stream) const;
-template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
-    int64_t *data, void *exec_stream) const;
-template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
-    int32_t *data, void *exec_stream) const;
-template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
-    uint8_t *data, void *exec_stream) const;
-template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
-    int8_t *data, void *exec_stream) const;
-template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
-    float16 *data, void *exec_stream) const;
-
-template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
-    float *data, CallbackFunc cb, void *cb_params) const;
-template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
-    int64_t *data, CallbackFunc cb, void *cb_params) const;
-template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
-    int32_t *data, CallbackFunc cb, void *cb_params) const;
-template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
-    uint8_t *data, CallbackFunc cb, void *cb_params) const;
-template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
-    int8_t *data, CallbackFunc cb, void *cb_params) const;
-template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
-    float16 *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data);
+template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data);
+template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data);
+template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data);
+template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data);
+template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data);
 
 template PD_INFER_DECL float *Tensor::data<float>(PlaceType *place,
                                                   int *size) const;

@@ -339,15 +285,12 @@ template PD_INFER_DECL uint8_t *Tensor::data<uint8_t>(PlaceType *place,
                                                       int *size) const;
 template PD_INFER_DECL int8_t *Tensor::data<int8_t>(PlaceType *place,
                                                     int *size) const;
-template PD_INFER_DECL float16 *Tensor::data<float16>(PlaceType *place,
-                                                      int *size) const;
 
 template PD_INFER_DECL float *Tensor::mutable_data<float>(PlaceType place);
 template PD_INFER_DECL int64_t *Tensor::mutable_data<int64_t>(PlaceType place);
 template PD_INFER_DECL int32_t *Tensor::mutable_data<int32_t>(PlaceType place);
 template PD_INFER_DECL uint8_t *Tensor::mutable_data<uint8_t>(PlaceType place);
 template PD_INFER_DECL int8_t *Tensor::mutable_data<int8_t>(PlaceType place);
-template PD_INFER_DECL float16 *Tensor::mutable_data<float16>(PlaceType place);
 
 Tensor::Tensor(void *scope) : scope_{scope} {
   PADDLE_ENFORCE_NOT_NULL(scope_,
paddle/fluid/inference/api/paddle_infer_contrib.cc (deleted, 100644 → 0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle_infer {
namespace contrib {

using paddle::PaddleDType;

void *TensorUtils::CudaMallocPinnedMemory(size_t size) {
#if defined(PADDLE_WITH_CUDA)
  void *ptr = nullptr;
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMallocHost(&ptr, size));
  return ptr;
#else
  return nullptr;
#endif
}

void TensorUtils::CudaFreePinnedMemory(void *ptr) {
#if defined(PADDLE_WITH_CUDA)
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(ptr));
#endif
}

void TensorUtils::CopyTensorImpl(Tensor *p_dst, const Tensor &src,
                                 void *exec_stream, CallbackFunc cb,
                                 void *cb_params) {
  Tensor &dst = *p_dst;
  dst.Reshape(src.shape());
  PADDLE_ENFORCE(
      src.place() == PlaceType::kCPU || src.place() == PlaceType::kGPU,
      paddle::platform::errors::InvalidArgument(
          "CopyTensor only support PlaceType kCPU/kGPU now."));
  PADDLE_ENFORCE(
      dst.place() == PlaceType::kCPU || dst.place() == PlaceType::kGPU,
      paddle::platform::errors::InvalidArgument(
          "CopyTensor only support PlaceType kCPU/kGPU now."));
  // copy to cpu, gpu => cpu or cpu => cpu
  if (dst.place() == PlaceType::kCPU) {
    switch (src.type()) {
      case PaddleDType::INT32:
        src.CopyToCpuImpl(dst.mutable_data<int32_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::INT64:
        src.CopyToCpuImpl(dst.mutable_data<int64_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::FLOAT32:
        src.CopyToCpuImpl(dst.mutable_data<float>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::UINT8:
        src.CopyToCpuImpl(dst.mutable_data<uint8_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::INT8:
        src.CopyToCpuImpl(dst.mutable_data<int8_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::FLOAT16:
        src.CopyToCpuImpl(
            dst.mutable_data<paddle::platform::float16>(PlaceType::kCPU),
            exec_stream, cb, cb_params);
        break;
      default:
        PADDLE_THROW(paddle::platform::errors::Unimplemented(
            "Only INT32, INT64, UINT8, INT8, FLOAT16 and "
            "FLOAT32 is supported in Tensor. Others not implements"));
    }
    // gpu => gpu or cpu => gpu
  } else {
#if defined(PADDLE_WITH_CUDA)
    void *dst_data = nullptr;
    void *src_data = nullptr;
    size_t data_len = 0;
    int data_size = 0;
    PlaceType src_place;
    switch (src.type()) {
      case PaddleDType::INT32:
        dst_data =
            static_cast<void *>(dst.mutable_data<int32_t>(PlaceType::kGPU));
        src_data =
            static_cast<void *>(src.data<int32_t>(&src_place, &data_size));
        data_len = data_size * sizeof(int32_t);
        break;
      case PaddleDType::INT64:
        dst_data =
            static_cast<void *>(dst.mutable_data<int64_t>(PlaceType::kGPU));
        src_data =
            static_cast<void *>(src.data<int64_t>(&src_place, &data_size));
        data_len = data_size * sizeof(int64_t);
        break;
      case PaddleDType::FLOAT32:
        dst_data =
            static_cast<void *>(dst.mutable_data<float>(PlaceType::kGPU));
        src_data =
            static_cast<void *>(src.data<float>(&src_place, &data_size));
        data_len = data_size * sizeof(float);
        break;
      case PaddleDType::UINT8:
        dst_data =
            static_cast<void *>(dst.mutable_data<uint8_t>(PlaceType::kGPU));
        src_data =
            static_cast<void *>(src.data<uint8_t>(&src_place, &data_size));
        data_len = data_size * sizeof(uint8_t);
        break;
      case PaddleDType::INT8:
        dst_data =
            static_cast<void *>(dst.mutable_data<int8_t>(PlaceType::kGPU));
        src_data =
            static_cast<void *>(src.data<int8_t>(&src_place, &data_size));
        data_len = data_size * sizeof(int8_t);
        break;
      case PaddleDType::FLOAT16:
        dst_data = static_cast<void *>(
            dst.mutable_data<paddle::platform::float16>(PlaceType::kGPU));
        src_data = static_cast<void *>(
            src.data<paddle::platform::float16>(&src_place, &data_size));
        data_len = data_size * 2;
        break;
      default:
        PADDLE_THROW(paddle::platform::errors::Unimplemented(
            "Only INT32, INT64, UINT8, INT8, FLOAT16 and "
            "FLOAT32 is supported in Tensor. Others not implements"));
    }
    paddle::platform::DeviceContextPool &pool =
        paddle::platform::DeviceContextPool::Instance();
    paddle::platform::CUDAPlace gpu_place(dst.device_);
    auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
        pool.Get(gpu_place));
    if (src.place() == PlaceType::kCPU) {
      paddle::memory::Copy(gpu_place, static_cast<void *>(dst_data),
                           paddle::platform::CPUPlace(), src_data, data_len,
                           dev_ctx->stream());
    } else {
      paddle::memory::Copy(gpu_place, static_cast<void *>(dst_data),
                           paddle::platform::CUDAPlace(), src_data, data_len,
                           dev_ctx->stream());
    }
    if (nullptr != exec_stream) {
      *(static_cast<cudaStream_t *>(exec_stream)) = dev_ctx->stream();
    } else if (cb) {
      cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
    } else {
      cudaStreamSynchronize(dev_ctx->stream());
    }
#else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "Can not copy tensor to GPU CUDA place because paddle is not compiled "
        "with CUDA."));
#endif
  }
  return;
}

void TensorUtils::CopyTensor(Tensor *p_dst, const Tensor &src) {
  CopyTensorImpl(p_dst, src, nullptr, nullptr, nullptr);
}

void TensorUtils::CopyTensorAsync(Tensor *p_dst, const Tensor &src,
                                  void *exec_stream) {
  CopyTensorImpl(p_dst, src, exec_stream, nullptr, nullptr);
}

void TensorUtils::CopyTensorAsync(Tensor *p_dst, const Tensor &src,
                                  CallbackFunc cb, void *cb_params) {
  CopyTensorImpl(p_dst, src, nullptr, cb, cb_params);
}

}  // namespace contrib
}  // namespace paddle_infer
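The deleted implementation above pairs the asynchronous copies with page-locked host buffers, so the device-to-host transfer can proceed without an implicit synchronization. A minimal caller-side sketch, assuming a GPU-resident output tensor and a known element count (both illustrative), following the pattern of the deleted test file below:

#include <cuda_runtime.h>
#include "paddle/fluid/inference/api/paddle_infer_contrib.h"

// Sketch only, against the removed API: pinned host memory keeps the
// asynchronous copy truly asynchronous; the stream the copy ran on is
// returned through the argument so the caller decides when to wait.
void PinnedCopySketch(paddle_infer::Tensor *output_tensor, int out_num) {
  using paddle_infer::contrib::TensorUtils;

  float *out_data = static_cast<float *>(
      TensorUtils::CudaMallocPinnedMemory(sizeof(float) * out_num));

  cudaStream_t stream;
  output_tensor->CopyToCpuAsync(out_data, static_cast<void *>(&stream));
  cudaStreamSynchronize(stream);  // out_data is valid from here on

  TensorUtils::CudaFreePinnedMemory(static_cast<void *>(out_data));
}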
paddle/fluid/inference/api/paddle_infer_contrib.h (deleted, 100644 → 0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/inference/api/paddle_inference_api.h"

namespace paddle_infer {
namespace contrib {

class TensorUtils {
 public:
  static void *CudaMallocPinnedMemory(size_t size);
  static void CudaFreePinnedMemory(void *mem);

  static void CopyTensor(Tensor *p_dst, const Tensor &src);
  static void CopyTensorAsync(Tensor *p_dst, const Tensor &src,
                              void *exec_stream);
  static void CopyTensorAsync(Tensor *p_dst, const Tensor &src,
                              CallbackFunc cb, void *cb_params);

 private:
  static void CopyTensorImpl(Tensor *p_dst, const Tensor &src,
                             void *exec_stream, CallbackFunc cb,
                             void *cb_params);
};

}  // namespace contrib
}  // namespace paddle_infer
paddle/fluid/inference/api/paddle_tensor.h

@@ -18,16 +18,6 @@
 namespace paddle_infer {
 
-typedef void (*CallbackFunc)(void *);
-
-#if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
-class InferApiTesterUtils;
-#endif
-
-namespace contrib {
-class TensorUtils;
-}
-
 /// \brief Paddle data type.
 enum DataType {
   FLOAT32,

@@ -82,21 +72,7 @@ class PD_INFER_DECL Tensor {
   /// It's usually used to get the output tensor data.
   /// \param[out] data The tensor will copy the data to the address.
   template <typename T>
-  void CopyToCpu(T *data) const;
-
-  /// \brief Copy the tensor data to the host memory asynchronously.
-  /// \param[out] data The tensor will copy the data to the address.
-  /// \param[out] exec_stream The tensor will excute copy in this stream(Only
-  /// GPU CUDA stream suppported now).
-  template <typename T>
-  void CopyToCpuAsync(T *data, void *exec_stream) const;
-
-  /// \brief Copy the tensor data to the host memory asynchronously.
-  /// \param[out] data The tensor will copy the data to the address.
-  /// \param[out] cb Callback function cb(cb_params) will be executed on the
-  /// host after all currently enqueued items in the stream have completed .
-  template <typename T>
-  void CopyToCpuAsync(T *data, CallbackFunc cb, void *cb_params) const;
+  void CopyToCpu(T *data);
 
   /// \brief Return the shape of the Tensor.
   std::vector<int> shape() const;

@@ -116,20 +92,12 @@ class PD_INFER_DECL Tensor {
   /// \return The data type of the tensor.
   DataType type() const;
 
-  /// \brief Return the place type of the tensor.
-  /// \return The place type of the tensor.
-  PlaceType place() const;
-
 protected:
   explicit Tensor(void *scope);
   void *FindTensor() const;
   void SetPlace(PlaceType place, int device = -1);
   void SetName(const std::string &name);
 
-  template <typename T>
-  void CopyToCpuImpl(T *data, void *stream = nullptr,
-                     CallbackFunc cb = nullptr,
-                     void *cb_params = nullptr) const;
-
   std::string name_;
   // The corresponding tensor pointer inside Paddle workspace is cached for
   // performance.

@@ -139,11 +107,6 @@ class PD_INFER_DECL Tensor {
   void *scope_{nullptr};
   PlaceType place_;
   int device_;
-
-  friend class paddle_infer::contrib::TensorUtils;
-#if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
-  friend class paddle_infer::InferApiTesterUtils;
-#endif
 };
 
 }  // namespace paddle_infer
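The removed doc comments above describe a second completion mode: instead of handing back the stream, the caller passes a host callback that fires once everything queued on the copy stream has finished. A minimal sketch of that form, following the deleted declarations; the function name, buffer, and callback body are illustrative:

// Sketch only: CallbackFunc was the removed `void (*)(void *)` typedef, so a
// capture-free lambda converts to it implicitly.
void CallbackCopySketch(paddle_infer::Tensor *output_tensor, float *out_data) {
  output_tensor->CopyToCpuAsync(
      out_data,
      [](void *cb_params) {
        // Runs on the host after the copy (and all earlier work on the same
        // stream) has completed.
        float *data = static_cast<float *>(cb_params);
        (void)data;  // consume the result here
      },
      static_cast<void *>(out_data));
  cudaDeviceSynchronize();  // or defer synchronization to a later point
}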
paddle/fluid/inference/tests/api/CMakeLists.txt

@@ -682,11 +682,6 @@ if(WITH_GPU)
     inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc
         EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
         ARGS --infer_model=${RESNET50_MODEL_DIR})
-    inference_analysis_test(paddle_infer_api_copy_tensor_tester SRCS paddle_infer_api_copy_tensor_tester.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${RESNET50_MODEL_DIR})
-    set_tests_properties(paddle_infer_api_copy_tensor_tester PROPERTIES TIMEOUT 30)
 endif()
 
 if("$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc (deleted, 100644 → 0)

/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <cuda_runtime.h>
#include <gtest/gtest.h>
#include <cstring>
#include <numeric>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle_infer {

class InferApiTesterUtils {
 public:
  static std::unique_ptr<Tensor> CreateInferTensorForTest(
      const std::string &name, PlaceType place, void *p_scope) {
    auto var = static_cast<paddle::framework::Scope *>(p_scope)->Var(name);
    var->GetMutable<paddle::framework::LoDTensor>();
    std::unique_ptr<Tensor> res(new Tensor(p_scope));
    res->input_or_output_ = true;
    res->SetName(name);
    res->SetPlace(place, 0 /*device id*/);
    return res;
  }
};

TEST(Tensor, copy_to_cpu_async_stream) {
  LOG(INFO) << GetVersion();
  UpdateDllFlag("conv_workspace_size_limit", "4000");
  std::string model_dir = FLAGS_infer_model + "/model";
  Config config;
  config.SetModel(model_dir + "/model", model_dir + "/params");
  config.EnableUseGpu(100, 0);

  auto predictor = CreatePredictor(config);
  auto pred_clone = predictor->Clone();

  std::vector<int> in_shape = {1, 3, 318, 318};
  int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
                               [](int &a, int &b) { return a * b; });

  std::vector<float> input(in_num, 1.0);

  const auto &input_names = predictor->GetInputNames();
  auto input_tensor = predictor->GetInputHandle(input_names[0]);

  input_tensor->Reshape(in_shape);
  input_tensor->CopyFromCpu(input.data());
  predictor->Run();

  const auto &output_names = predictor->GetOutputNames();
  auto output_tensor = predictor->GetOutputHandle(output_names[0]);
  std::vector<int> output_shape = output_tensor->shape();
  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
                                std::multiplies<int>());

  float *out_data = static_cast<float *>(
      contrib::TensorUtils::CudaMallocPinnedMemory(sizeof(float) * out_num));
  memset(out_data, 0, sizeof(float) * out_num);

  std::vector<float> correct_out_data = {
      127.78,   1.07353,  -229.42, 1127.28, -177.365,
      -292.412, -271.614, 466.054, 540.436, -214.223,
  };

  for (int i = 0; i < 100; i++) {
    predictor->Run();
  }

  cudaStream_t stream;
  output_tensor->CopyToCpuAsync(out_data, static_cast<void *>(&stream));
  // sync
  cudaStreamSynchronize(stream);

  for (int i = 0; i < 10; i++) {
    EXPECT_NEAR(out_data[i] / correct_out_data[i], 1.0, 1e-3);
  }
  contrib::TensorUtils::CudaFreePinnedMemory(static_cast<void *>(out_data));
}

TEST(Tensor, copy_to_cpu_async_callback) {
  LOG(INFO) << GetVersion();
  UpdateDllFlag("conv_workspace_size_limit", "4000");
  std::string model_dir = FLAGS_infer_model + "/model";
  Config config;
  config.SetModel(model_dir + "/model", model_dir + "/params");
  config.EnableUseGpu(100, 0);

  auto predictor = CreatePredictor(config);
  auto pred_clone = predictor->Clone();

  std::vector<int> in_shape = {1, 3, 318, 318};
  int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
                               [](int &a, int &b) { return a * b; });

  std::vector<float> input(in_num, 1.0);

  const auto &input_names = predictor->GetInputNames();
  auto input_tensor = predictor->GetInputHandle(input_names[0]);

  input_tensor->Reshape(in_shape);
  input_tensor->CopyFromCpu(input.data());
  predictor->Run();

  const auto &output_names = predictor->GetOutputNames();
  auto output_tensor = predictor->GetOutputHandle(output_names[0]);
  std::vector<int> output_shape = output_tensor->shape();
  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
                                std::multiplies<int>());

  float *out_data = static_cast<float *>(
      contrib::TensorUtils::CudaMallocPinnedMemory(sizeof(float) * out_num));
  memset(out_data, 0, sizeof(float) * out_num);

  for (int i = 0; i < 100; i++) {
    predictor->Run();
  }

  output_tensor->CopyToCpuAsync(
      out_data,
      [](void *cb_params) {
        float *data = static_cast<float *>(cb_params);
        std::vector<float> correct_out_data = {
            127.78,   1.07353,  -229.42, 1127.28, -177.365,
            -292.412, -271.614, 466.054, 540.436, -214.223,
        };
        for (int i = 0; i < 10; i++) {
          EXPECT_NEAR(data[i] / correct_out_data[i], 1.0, 1e-3);
        }
      },
      static_cast<void *>(out_data));

  cudaDeviceSynchronize();
  contrib::TensorUtils::CudaFreePinnedMemory(static_cast<void *>(out_data));
}

template <class DTYPE>
static void test_copy_tensor(PlaceType src_place, PlaceType dst_place) {
  paddle::framework::Scope scope;
  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
      "tensor_src", src_place, static_cast<void *>(&scope));
  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
      "tensor_dst", dst_place, static_cast<void *>(&scope));

  std::vector<DTYPE> data_src(6, 1);
  tensor_src->Reshape({2, 3});
  tensor_src->CopyFromCpu(data_src.data());

  std::vector<DTYPE> data_dst(4, 2);
  tensor_dst->Reshape({2, 2});
  tensor_dst->CopyFromCpu(data_dst.data());

  paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);

  EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
  EXPECT_EQ(tensor_dst->shape()[0], 2);
  EXPECT_EQ(tensor_dst->shape()[1], 3);

  std::vector<DTYPE> data_check(6, 3);
  tensor_dst->CopyToCpu<DTYPE>(static_cast<DTYPE *>(data_check.data()));

  for (int i = 0; i < 6; i++) {
    EXPECT_NEAR(data_check[i], 1, 1e-5);
  }
}

TEST(CopyTensor, float32) {
  test_copy_tensor<float>(PlaceType::kCPU, PlaceType::kCPU);
  test_copy_tensor<float>(PlaceType::kCPU, PlaceType::kGPU);
  test_copy_tensor<float>(PlaceType::kGPU, PlaceType::kGPU);
}

TEST(CopyTensor, int32) {
  test_copy_tensor<int32_t>(PlaceType::kCPU, PlaceType::kCPU);
  test_copy_tensor<int32_t>(PlaceType::kGPU, PlaceType::kGPU);
}

TEST(CopyTensor, int64) {
  test_copy_tensor<int64_t>(PlaceType::kCPU, PlaceType::kCPU);
  test_copy_tensor<int64_t>(PlaceType::kGPU, PlaceType::kGPU);
}

TEST(CopyTensor, int8) {
  test_copy_tensor<int8_t>(PlaceType::kCPU, PlaceType::kCPU);
  test_copy_tensor<int8_t>(PlaceType::kGPU, PlaceType::kGPU);
}

TEST(CopyTensor, uint8) {
  test_copy_tensor<uint8_t>(PlaceType::kCPU, PlaceType::kCPU);
  test_copy_tensor<uint8_t>(PlaceType::kGPU, PlaceType::kGPU);
}

TEST(CopyTensor, float16) {
  paddle::framework::Scope scope;
  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
      "tensor_src", PlaceType::kCPU, static_cast<void *>(&scope));
  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
      "tensor_dst", PlaceType::kCPU, static_cast<void *>(&scope));

  using paddle::platform::float16;
  std::vector<float16> data_src(6, float16(1.0));
  tensor_src->Reshape({2, 3});
  tensor_src->CopyFromCpu(data_src.data());

  std::vector<float16> data_dst(4, float16(2.0));
  tensor_dst->Reshape({2, 2});
  tensor_dst->CopyFromCpu(data_dst.data());

  paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);

  EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
  EXPECT_EQ(tensor_dst->shape()[0], 2);
  EXPECT_EQ(tensor_dst->shape()[1], 3);

  std::vector<float16> data_check(6, float16(1.0));
  tensor_dst->CopyToCpu<float16>(data_check.data());

  for (int i = 0; i < 6; i++) {
    EXPECT_TRUE(data_check[i] == float16(1.0));
  }
}

TEST(CopyTensor, float16_gpu) {
  paddle::framework::Scope scope;
  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
      "tensor_src", PlaceType::kGPU, static_cast<void *>(&scope));
  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
      "tensor_dst", PlaceType::kGPU, static_cast<void *>(&scope));

  using paddle::platform::float16;
  std::vector<float16> data_src(6, float16(1.0));
  tensor_src->Reshape({2, 3});
  tensor_src->CopyFromCpu(data_src.data());

  std::vector<float16> data_dst(4, float16(2.0));
  tensor_dst->Reshape({2, 2});
  tensor_dst->CopyFromCpu(data_dst.data());

  paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);

  EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
  EXPECT_EQ(tensor_dst->shape()[0], 2);
  EXPECT_EQ(tensor_dst->shape()[1], 3);

  std::vector<float16> data_check(6, float16(1.0));
  tensor_dst->CopyToCpu<float16>(data_check.data());

  for (int i = 0; i < 6; i++) {
    EXPECT_TRUE(data_check[i] == float16(1.0));
  }
}

TEST(CopyTensor, async_stream) {
  paddle::framework::Scope scope;
  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
      "tensor_src", PlaceType::kGPU, static_cast<void *>(&scope));
  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
      "tensor_dst", PlaceType::kGPU, static_cast<void *>(&scope));

  std::vector<float> data_src(6, 1.0);
  tensor_src->Reshape({2, 3});
  tensor_src->CopyFromCpu(data_src.data());

  std::vector<float> data_dst(4, 2.0);
  tensor_dst->Reshape({2, 2});
  tensor_dst->CopyFromCpu(data_dst.data());

  cudaStream_t stream;
  paddle_infer::contrib::TensorUtils::CopyTensorAsync(
      tensor_dst.get(), *tensor_src, static_cast<void *>(&stream));

  EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
  EXPECT_EQ(tensor_dst->shape()[0], 2);
  EXPECT_EQ(tensor_dst->shape()[1], 3);

  cudaStreamSynchronize(stream);

  std::vector<float> data_check(6, 1.0);
  tensor_dst->CopyToCpu<float>(data_check.data());

  for (int i = 0; i < 6; i++) {
    EXPECT_NEAR(data_check[i], static_cast<float>(1.0), 1e-5);
  }
}

TEST(CopyTensor, async_callback) {
  paddle::framework::Scope scope;
  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
      "tensor_src", PlaceType::kCPU, static_cast<void *>(&scope));
  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
      "tensor_dst", PlaceType::kGPU, static_cast<void *>(&scope));

  std::vector<float> data_src(6, 1.0);
  tensor_src->Reshape({2, 3});
  tensor_src->CopyFromCpu(data_src.data());

  std::vector<float> data_dst(4, 2.0);
  tensor_dst->Reshape({2, 2});
  tensor_dst->CopyFromCpu(data_dst.data());

  paddle_infer::contrib::TensorUtils::CopyTensorAsync(
      tensor_dst.get(), *tensor_src,
      [](void *cb_params) {
        Tensor *tensor = static_cast<Tensor *>(cb_params);
        EXPECT_EQ(tensor->shape().size(), (size_t)2);
        EXPECT_EQ(tensor->shape()[0], 2);
        EXPECT_EQ(tensor->shape()[1], 3);
      },
      static_cast<void *>(&(*tensor_dst)));

  cudaDeviceSynchronize();
}

}  // namespace paddle_infer
paddle/fluid/pybind/inference_api.cc

@@ -28,7 +28,6 @@
 #include <vector>
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
 #include "paddle/fluid/inference/utils/io_utils.h"

@@ -287,12 +286,6 @@ py::bytes SerializePDTensorToBytes(PaddleTensor &tensor) {  // NOLINT
   paddle::inference::SerializePDTensorToStream(&ss, tensor);
   return static_cast<py::bytes>(ss.str());
 }
 
-void CopyPaddleInferTensor(paddle_infer::Tensor &dst,
-                           const paddle_infer::Tensor &src) {
-  return paddle_infer::contrib::TensorUtils::CopyTensor(&dst, src);
-}
-
 }  // namespace
 
 void BindInferenceApi(py::module *m) {

@@ -324,7 +317,6 @@ void BindInferenceApi(py::module *m) {
         new paddle_infer::Predictor(config));
     return std::move(pred);
   });
-  m->def("copy_tensor", &CopyPaddleInferTensor);
   m->def("paddle_dtype_size", &paddle::PaddleDtypeSize);
   m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes);
   m->def("get_version", &paddle_infer::GetVersion);
python/paddle/inference/contrib/__init__.py (deleted, 100644 → 0)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python/paddle/inference/contrib/utils/__init__.py (deleted, 100644 → 0)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ....fluid.core import copy_tensor  # noqa: F401