diff --git a/paddle/fluid/inference/api/paddle_infer_contrib.cc b/paddle/fluid/inference/api/paddle_infer_contrib.cc
index 51b27f8ca3ac07430cb0446ada11519f0ae08eff..b33e07b236bd7852b4832d23aecfe6af6fe40257 100644
--- a/paddle/fluid/inference/api/paddle_infer_contrib.cc
+++ b/paddle/fluid/inference/api/paddle_infer_contrib.cc
@@ -89,6 +89,12 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst,
                         cb,
                         cb_params);
       break;
+    case PaddleDType::BOOL:
+      src.CopyToCpuImpl(dst.mutable_data<bool>(PlaceType::kCPU),
+                        exec_stream,
+                        cb,
+                        cb_params);
+      break;
     case PaddleDType::FLOAT16:
       src.CopyToCpuImpl(
           dst.mutable_data<paddle::platform::float16>(PlaceType::kCPU),
@@ -98,7 +104,7 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst,
       break;
     default:
       PADDLE_THROW(paddle::platform::errors::Unimplemented(
-          "Only INT32, INT64, UINT8, INT8, FLOAT16 and "
+          "Only INT32, INT64, UINT8, INT8, BOOL, FLOAT16 and "
           "FLOAT32 is supported in Tensor. Others not implements"));
   }
   // gpu => gpu or cpu => gpu
@@ -142,6 +148,11 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst,
         src_data = static_cast<void*>(src.data<int8_t>(&src_place, &data_size));
         data_len = data_size * sizeof(int8_t);
         break;
+      case PaddleDType::BOOL:
+        dst_data = static_cast<void*>(dst.mutable_data<bool>(PlaceType::kGPU));
+        src_data = static_cast<void*>(src.data<bool>(&src_place, &data_size));
+        data_len = data_size * sizeof(bool);
+        break;
       case PaddleDType::FLOAT16:
         dst_data = static_cast<void*>(
             dst.mutable_data<paddle::platform::float16>(PlaceType::kGPU));
@@ -151,7 +162,7 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst,
       break;
     default:
       PADDLE_THROW(paddle::platform::errors::Unimplemented(
-          "Only INT32, INT64, UINT8, INT8, FLOAT16 and "
+          "Only INT32, INT64, UINT8, INT8, BOOL, FLOAT16 and "
          "FLOAT32 is supported in Tensor. Others not implements"));
   }
diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc
index 7e6256b378e94ee44c00d2e45e3747dd97d03b87..85b778769c640ef5536465b763e24db6394c5044 100644
--- a/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc
+++ b/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <array>
 #include <cstring>
 #include <vector>
 
@@ -207,30 +208,167 @@ static void test_copy_tensor(PlaceType src_place, PlaceType dst_place) {
 TEST(CopyTensor, float32) {
   test_copy_tensor<float>(PlaceType::kCPU, PlaceType::kCPU);
   test_copy_tensor<float>(PlaceType::kCPU, PlaceType::kGPU);
+  test_copy_tensor<float>(PlaceType::kGPU, PlaceType::kCPU);
   test_copy_tensor<float>(PlaceType::kGPU, PlaceType::kGPU);
 }
 
 TEST(CopyTensor, int32) {
   test_copy_tensor<int32_t>(PlaceType::kCPU, PlaceType::kCPU);
+  test_copy_tensor<int32_t>(PlaceType::kCPU, PlaceType::kGPU);
+  test_copy_tensor<int32_t>(PlaceType::kGPU, PlaceType::kCPU);
   test_copy_tensor<int32_t>(PlaceType::kGPU, PlaceType::kGPU);
 }
 
 TEST(CopyTensor, int64) {
   test_copy_tensor<int64_t>(PlaceType::kCPU, PlaceType::kCPU);
+  test_copy_tensor<int64_t>(PlaceType::kCPU, PlaceType::kGPU);
+  test_copy_tensor<int64_t>(PlaceType::kGPU, PlaceType::kCPU);
   test_copy_tensor<int64_t>(PlaceType::kGPU, PlaceType::kGPU);
 }
 
 TEST(CopyTensor, int8) {
   test_copy_tensor<int8_t>(PlaceType::kCPU, PlaceType::kCPU);
+  test_copy_tensor<int8_t>(PlaceType::kCPU, PlaceType::kGPU);
+  test_copy_tensor<int8_t>(PlaceType::kGPU, PlaceType::kCPU);
   test_copy_tensor<int8_t>(PlaceType::kGPU, PlaceType::kGPU);
 }
 
 TEST(CopyTensor, uint8) {
   test_copy_tensor<uint8_t>(PlaceType::kCPU, PlaceType::kCPU);
+  test_copy_tensor<uint8_t>(PlaceType::kCPU, PlaceType::kGPU);
+  test_copy_tensor<uint8_t>(PlaceType::kGPU, PlaceType::kCPU);
   test_copy_tensor<uint8_t>(PlaceType::kGPU, PlaceType::kGPU);
 }
 
-TEST(CopyTensor, float16) {
+TEST(CopyTensor, bool_cpu_to_cpu) {
+  paddle::framework::Scope scope;
+  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_src", PlaceType::kCPU, static_cast<void *>(&scope));
+  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_dst", PlaceType::kCPU, static_cast<void *>(&scope));
+
+  std::array<bool, 6> data_src;
+  data_src.fill(true);
+  tensor_src->Reshape({2, 3});
+  tensor_src->CopyFromCpu(data_src.data());
+
+  std::array<bool, 4> data_dst;
+  data_dst.fill(false);
+  tensor_dst->Reshape({2, 2});
+  tensor_dst->CopyFromCpu(data_dst.data());
+
+  paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);
+
+  EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
+  EXPECT_EQ(tensor_dst->shape()[0], 2);
+  EXPECT_EQ(tensor_dst->shape()[1], 3);
+
+  std::array<bool, 6> data_check;
+  data_check.fill(false);
+  tensor_dst->CopyToCpu(data_check.data());
+
+  for (int i = 0; i < 6; i++) {
+    EXPECT_TRUE(data_check[i] == true);
+  }
+}
+
+TEST(CopyTensor, bool_gpu_to_gpu) {
+  paddle::framework::Scope scope;
+  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_src", PlaceType::kGPU, static_cast<void *>(&scope));
+  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_dst", PlaceType::kGPU, static_cast<void *>(&scope));
+
+  std::array<bool, 6> data_src;
+  data_src.fill(true);
+  tensor_src->Reshape({2, 3});
+  tensor_src->CopyFromCpu(data_src.data());
+
+  std::array<bool, 4> data_dst;
+  data_dst.fill(false);
+  tensor_dst->Reshape({2, 2});
+  tensor_dst->CopyFromCpu(data_dst.data());
+
+  paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);
+
+  EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
+  EXPECT_EQ(tensor_dst->shape()[0], 2);
+  EXPECT_EQ(tensor_dst->shape()[1], 3);
+
+  std::array<bool, 6> data_check;
+  data_check.fill(false);
+  tensor_dst->CopyToCpu(data_check.data());
+
+  for (int i = 0; i < 6; i++) {
+    EXPECT_TRUE(data_check[i] == true);
+  }
+}
+
+TEST(CopyTensor, bool_gpu_to_cpu) {
+  paddle::framework::Scope scope;
+  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_src", PlaceType::kGPU, static_cast<void *>(&scope));
+  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_dst", PlaceType::kCPU, static_cast<void *>(&scope));
+
+  std::array<bool, 6> data_src;
+  data_src.fill(true);
+  tensor_src->Reshape({2, 3});
+  tensor_src->CopyFromCpu(data_src.data());
+
+  std::array<bool, 4> data_dst;
+  data_dst.fill(false);
+  tensor_dst->Reshape({2, 2});
+  tensor_dst->CopyFromCpu(data_dst.data());
+
+  paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);
+
+  EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
+  EXPECT_EQ(tensor_dst->shape()[0], 2);
+  EXPECT_EQ(tensor_dst->shape()[1], 3);
+
+  std::array<bool, 6> data_check;
+  data_check.fill(false);
+  tensor_dst->CopyToCpu(data_check.data());
+
+  for (int i = 0; i < 6; i++) {
+    EXPECT_TRUE(data_check[i] == true);
+  }
+}
+
+TEST(CopyTensor, bool_cpu_to_gpu) {
+  paddle::framework::Scope scope;
+  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_src", PlaceType::kCPU, static_cast<void *>(&scope));
+  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_dst", PlaceType::kGPU, static_cast<void *>(&scope));
+
+  std::array<bool, 6> data_src;
+  data_src.fill(true);
+  tensor_src->Reshape({2, 3});
+  tensor_src->CopyFromCpu(data_src.data());
+
+  std::array<bool, 4> data_dst;
+  data_dst.fill(false);
+  tensor_dst->Reshape({2, 2});
+  tensor_dst->CopyFromCpu(data_dst.data());
+
+  paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);
+
+  EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
+  EXPECT_EQ(tensor_dst->shape()[0], 2);
+  EXPECT_EQ(tensor_dst->shape()[1], 3);
+
+  std::array<bool, 6> data_check{false};
+  data_check.fill(false);
+  tensor_dst->CopyToCpu(data_check.data());
+
+  for (int i = 0; i < 6; i++) {
+    EXPECT_TRUE(data_check[i] == true);
+  }
+}
+
+TEST(CopyTensor, float16_cpu_to_cpu) {
   paddle::framework::Scope scope;
   auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
       "tensor_src", PlaceType::kCPU, static_cast<void *>(&scope));
@@ -252,7 +390,7 @@ TEST(CopyTensor, float16) {
   EXPECT_EQ(tensor_dst->shape()[0], 2);
   EXPECT_EQ(tensor_dst->shape()[1], 3);
 
-  std::vector<float16> data_check(6, float16(1.0));
+  std::vector<float16> data_check(6, float16(2.0));
   tensor_dst->CopyToCpu(data_check.data());
 
   for (int i = 0; i < 6; i++) {
@@ -260,7 +398,7 @@ TEST(CopyTensor, float16) {
   }
 }
 
-TEST(CopyTensor, float16_gpu) {
+TEST(CopyTensor, float16_gpu_to_gpu) {
   paddle::framework::Scope scope;
   auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
       "tensor_src", PlaceType::kGPU, static_cast<void *>(&scope));
@@ -282,7 +420,67 @@ TEST(CopyTensor, float16_gpu) {
   EXPECT_EQ(tensor_dst->shape()[0], 2);
   EXPECT_EQ(tensor_dst->shape()[1], 3);
 
-  std::vector<float16> data_check(6, float16(1.0));
+  std::vector<float16> data_check(6, float16(2.0));
+  tensor_dst->CopyToCpu(data_check.data());
+
+  for (int i = 0; i < 6; i++) {
+    EXPECT_TRUE(data_check[i] == float16(1.0));
+  }
+}
+
+TEST(CopyTensor, float16_cpu_to_gpu) {
+  paddle::framework::Scope scope;
+  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_src", PlaceType::kCPU, static_cast<void *>(&scope));
+  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_dst", PlaceType::kGPU, static_cast<void *>(&scope));
+
+  using paddle::platform::float16;
+  std::vector<float16> data_src(6, float16(1.0));
+  tensor_src->Reshape({2, 3});
+  tensor_src->CopyFromCpu(data_src.data());
+
+  std::vector<float16> data_dst(4, float16(2.0));
+  tensor_dst->Reshape({2, 2});
+  tensor_dst->CopyFromCpu(data_dst.data());
+
+  paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);
+
+  EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
+  EXPECT_EQ(tensor_dst->shape()[0], 2);
+  EXPECT_EQ(tensor_dst->shape()[1], 3);
+
+  std::vector<float16> data_check(6, float16(2.0));
+  tensor_dst->CopyToCpu(data_check.data());
+
+  for (int i = 0; i < 6; i++) {
+    EXPECT_TRUE(data_check[i] == float16(1.0));
+  }
+}
+
+TEST(CopyTensor, float16_gpu_to_cpu) {
+  paddle::framework::Scope scope;
+  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_src", PlaceType::kGPU, static_cast<void *>(&scope));
+  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_dst", PlaceType::kCPU, static_cast<void *>(&scope));
+
+  using paddle::platform::float16;
+  std::vector<float16> data_src(6, float16(1.0));
+  tensor_src->Reshape({2, 3});
+  tensor_src->CopyFromCpu(data_src.data());
+
+  std::vector<float16> data_dst(4, float16(2.0));
+  tensor_dst->Reshape({2, 2});
+  tensor_dst->CopyFromCpu(data_dst.data());
+
+  paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);
+
+  EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
+  EXPECT_EQ(tensor_dst->shape()[0], 2);
+  EXPECT_EQ(tensor_dst->shape()[1], 3);
+
+  std::vector<float16> data_check(6, float16(2.0));
   tensor_dst->CopyToCpu(data_check.data());
 
   for (int i = 0; i < 6; i++) {